26 files changed, 1082 insertions, 184 deletions
diff --git a/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index deaa11325809..496f2befde58 100644
--- a/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/contrib/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -460,7 +460,7 @@ bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   } else if (BPFOperand::isValidIdAtStart (Name))
     Operands.push_back(BPFOperand::createToken(Name, NameLoc));
   else
-    return true;
+    return Error(NameLoc, "invalid register/token name");
 
   while (!getLexer().is(AsmToken::EndOfStatement)) {
     // Attempt to parse token as operator
@@ -472,8 +472,10 @@ bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       continue;
 
     // Attempt to parse token as an immediate
-    if (parseImmediate(Operands) != MatchOperand_Success)
-      return true;
+    if (parseImmediate(Operands) != MatchOperand_Success) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token");
+    }
   }
 
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
diff --git a/contrib/llvm/lib/Target/BPF/BPF.h b/contrib/llvm/lib/Target/BPF/BPF.h
index 4a0cb20357c8..76d3e1ca5f6f 100644
--- a/contrib/llvm/lib/Target/BPF/BPF.h
+++ b/contrib/llvm/lib/Target/BPF/BPF.h
@@ -17,6 +17,11 @@ namespace llvm {
 class BPFTargetMachine;
 
 FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+FunctionPass *createBPFMIPeepholePass();
+FunctionPass *createBPFMIPreEmitPeepholePass();
+
+void initializeBPFMIPeepholePass(PassRegistry&);
+void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
 }
 
 #endif
diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td
index 2d0c22a3a516..877bd15f4f2b 100644
--- a/contrib/llvm/lib/Target/BPF/BPF.td
+++ b/contrib/llvm/lib/Target/BPF/BPF.td
@@ -26,6 +26,12 @@ def : Proc<"probe", []>;
 def DummyFeature : SubtargetFeature<"dummy", "isDummyMode",
                                     "true", "unused feature">;
 
+def ALU32 : SubtargetFeature<"alu32", "HasAlu32", "true",
+                             "Enable ALU32 instructions">;
+
+def DwarfRIS: SubtargetFeature<"dwarfris", "UseDwarfRIS", "true",
+                               "Disable MCAsmInfo DwarfUsesRelocationsAcrossSections">;
+
 def BPFInstPrinter : AsmWriter {
   string AsmWriterClassName  = "InstPrinter";
   bit isMCAsmWriter = 1;
diff --git a/contrib/llvm/lib/Target/BPF/BPFCallingConv.td b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
index 8cec6fa54698..637f9752ec42 100644
--- a/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
+++ b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
@@ -26,4 +26,24 @@ def CC_BPF64 : CallingConv<[
   CCAssignToStack<8, 8>
 ]>;
 
+// Return-value convention when -mattr=+alu32 is enabled
+def RetCC_BPF32 : CallingConv<[
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0], [R0]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0], [W0]>>
+]>;
+
+// Calling convention when -mattr=+alu32 is enabled
+def CC_BPF32 : CallingConv<[
+  // i32 arguments are assigned to W1-W5, shadowing the matching R1-R5.
+  CCIfType<[i32], CCAssignToRegWithShadow<[W1, W2, W3, W4, W5],
+                                          [R1, R2, R3, R4, R5]>>,
+
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[i64], CCAssignToRegWithShadow<[R1, R2, R3, R4, R5],
+                                          [W1, W2, W3, W4, W5]>>,
+
+  // Could be assigned to the stack in 8-byte aligned units, but unsupported
+  CCAssignToStack<8, 8>
+]>;
+
 def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 61b04d1f2a13..8b9bc08e144f 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -39,8 +39,14 @@ using namespace llvm;
 namespace {
 
 class BPFDAGToDAGISel : public SelectionDAGISel {
+
+  /// Subtarget - Keep a pointer to the BPFSubtarget around so that we can
+  /// make the right decision when generating code for different subtargets.
+  const BPFSubtarget *Subtarget;
+
 public:
-  explicit BPFDAGToDAGISel(BPFTargetMachine &TM) : SelectionDAGISel(TM) {
+  explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
+      : SelectionDAGISel(TM), Subtarget(nullptr) {
     curr_func_ = nullptr;
   }
 
@@ -48,6 +54,12 @@ public:
     return "BPF DAG->DAG Pattern Instruction Selection";
   }
 
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // Reset the subtarget each time through.
+    Subtarget = &MF.getSubtarget<BPFSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
   void PreprocessISelDAG() override;
 
   bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
@@ -65,9 +77,9 @@ private:
   bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
 
   // Node preprocessing cases
-  void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I);
+  void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator &I);
   void PreprocessCopyToReg(SDNode *Node);
-  void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I);
+  void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I);
 
   // Find constants from a constant structure
   typedef std::vector<unsigned char> val_vec_type;
@@ -176,12 +188,9 @@ bool BPFDAGToDAGISel::SelectInlineAsmMemoryOperand(
 void BPFDAGToDAGISel::Select(SDNode *Node) {
   unsigned Opcode = Node->getOpcode();
 
-  // Dump information about the Node being selected
-  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
-
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
-    DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
     return;
   }
 
@@ -241,7 +250,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
 }
 
 void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
-                                     SelectionDAG::allnodes_iterator I) {
+                                     SelectionDAG::allnodes_iterator &I) {
   union {
     uint8_t c[8];
     uint16_t s;
@@ -268,7 +277,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
     if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0)
       return;
 
-    DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+    LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
 
     const GlobalAddressSDNode *GADN =
         dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
@@ -278,7 +287,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
           getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
   } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
              LDAddrNode->getNumOperands() > 0) {
-    DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+    LLVM_DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
 
     SDValue OP1 = LDAddrNode->getOperand(0);
     if (const GlobalAddressSDNode *GADN =
@@ -301,8 +310,8 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
     val = new_val.d;
   }
 
-  DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val
-               << '\n');
+  LLVM_DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
+                    << val << '\n');
   SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
 
   // After replacement, the current node is dead, we need to
@@ -418,8 +427,8 @@ bool BPFDAGToDAGISel::fillGenericConstant(const DataLayout &DL,
 
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
     uint64_t val = CI->getZExtValue();
-    DEBUG(dbgs() << "Byte array at offset " << Offset << " with value " << val
-                 << '\n');
+    LLVM_DEBUG(dbgs() << "Byte array at offset " << Offset << " with value "
+                      << val << '\n');
 
     if (Size > 8 || (Size & (Size - 1)))
       return false;
@@ -508,17 +517,49 @@ void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
     break;
   }
 
-  DEBUG(dbgs() << "Find Load Value to VReg "
-               << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n');
+  LLVM_DEBUG(dbgs() << "Find Load Value to VReg "
+                    << TargetRegisterInfo::virtReg2Index(RegN->getReg())
+                    << '\n');
   load_to_vreg_[RegN->getReg()] = mem_load_op;
 }
 
 void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
-                                      SelectionDAG::allnodes_iterator I) {
+                                      SelectionDAG::allnodes_iterator &I) {
   ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
   if (!MaskN)
     return;
 
+  // The Reg operand is normally a virtual register defined outside the
+  // current basic block: the DAG combiner already does a pretty good job
+  // of removing redundant truncation inside a single basic block. The one
+  // exception is a Reg operand that comes from bpf_load_[byte | half | word],
+  // because the generic optimizer doesn't understand that those results are
+  // already zero extended.
+  SDValue BaseV = Node->getOperand(0);
+  if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
+    uint64_t MaskV = MaskN->getZExtValue();
+
+    if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
+          (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
+          (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
+      return;
+
+    LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
+               Node->dump(); dbgs() << '\n');
+
+    I--;
+    CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
+    I++;
+    CurDAG->DeleteNode(Node);
+
+    return;
+  }
+
+  // Multiple basic blocks case.
+  if (BaseV.getOpcode() != ISD::CopyFromReg)
+    return;
+
   unsigned match_load_op = 0;
   switch (MaskN->getZExtValue()) {
   default:
@@ -534,19 +575,12 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
     break;
   }
 
-  // The Reg operand should be a virtual register, which is defined
-  // outside the current basic block. DAG combiner has done a pretty
-  // good job in removing truncating inside a single basic block.
-  SDValue BaseV = Node->getOperand(0);
-  if (BaseV.getOpcode() != ISD::CopyFromReg)
-    return;
-
   const RegisterSDNode *RegN =
       dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
   if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
     return;
   unsigned AndOpReg = RegN->getReg();
-  DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
+  LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
 
   // Examine the PHI insns in the MachineBasicBlock to found out the
   // definitions of this virtual register. At this stage (DAG2DAG
@@ -576,8 +610,8 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
     //   %2 = PHI %0, <%bb.1>, %1, <%bb.3>
     // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3)
     // The AND operation can be removed if both %0 in %bb.1 and %1 in
-    // %bb.3 are defined with with a load matching the MaskN.
-    DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
+    // %bb.3 are defined with a load matching the MaskN.
+    LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
     unsigned PrevReg = -1;
     for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
       const MachineOperand &MOP = MII->getOperand(i);
@@ -593,8 +627,8 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
     }
   }
 
-  DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
-        dbgs() << '\n');
+  LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
+             dbgs() << '\n');
 
   I--;
   CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 3ea96e3148f2..9272cf692dc9 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "bpf-lower"
 
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+  cl::Hidden, cl::init(false),
+  cl::desc("Expand memcpy into load/store pairs in order"));
+
 static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
   MachineFunction &MF = DAG.getMachineFunction();
   DAG.getContext()->diagnose(
@@ -57,6 +61,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
 
   // Set up the register classes.
   addRegisterClass(MVT::i64, &BPF::GPRRegClass);
+  if (STI.getHasAlu32())
+    addRegisterClass(MVT::i32, &BPF::GPR32RegClass);
 
   // Compute derived properties from the register classes
   computeRegisterProperties(STI.getRegisterInfo());
@@ -67,9 +73,6 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
   setOperationAction(ISD::BRIND, MVT::Other, Expand);
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
-  setOperationAction(ISD::SETCC, MVT::i64, Expand);
-  setOperationAction(ISD::SELECT, MVT::i64, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
 
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
 
@@ -77,32 +80,39 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
-  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-  setOperationAction(ISD::SREM, MVT::i64, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
-
-  setOperationAction(ISD::MULHU, MVT::i64, Expand);
-  setOperationAction(ISD::MULHS, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  for (auto VT : { MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i32 && !STI.getHasAlu32())
+      continue;
 
-  setOperationAction(ISD::ADDC, MVT::i64, Expand);
-  setOperationAction(ISD::ADDE, MVT::i64, Expand);
-  setOperationAction(ISD::SUBC, MVT::i64, Expand);
-  setOperationAction(ISD::SUBE, MVT::i64, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::SHL_PARTS, VT, Expand);
+    setOperationAction(ISD::SRL_PARTS, VT, Expand);
+    setOperationAction(ISD::SRA_PARTS, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+
+    setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Custom);
+  }
 
-  setOperationAction(ISD::ROTR, MVT::i64, Expand);
-  setOperationAction(ISD::ROTL, MVT::i64, Expand);
-  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
-  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
-  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+  if (STI.getHasAlu32()) {
+    setOperationAction(ISD::BSWAP, MVT::i32, Promote);
+    setOperationAction(ISD::BR_CC, MVT::i32, Promote);
+  }
 
   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
@@ -126,12 +136,33 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   setMinFunctionAlignment(3);
   setPrefFunctionAlignment(3);
 
-  // inline memcpy() for kernel to see explicit copy
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+  if (BPFExpandMemcpyInOrder) {
+    // LLVM generic code will try to expand memcpy into load/store pairs at this
+    // stage which is before quite a few IR optimization passes, therefore the
+    // loads and stores could potentially be moved apart from each other which
+    // will cause trouble to memcpy pattern matcher inside kernel eBPF JIT
+    // compilers.
+    //
+    // When -bpf-expand-memcpy-in-order is specified, we defer the expansion of
+    // memcpy to a later stage in the IR optimization pipeline so those
+    // load/store pairs won't be touched and can be kept in order. Hence, we set
+    // MaxStoresPerMem* to zero to disable the generic getMemcpyLoadsAndStores
+    // code path, and ask LLVM to use target expander EmitTargetCodeForMemcpy.
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+  } else {
+    // inline memcpy() for kernel to see explicit copy
+    unsigned CommonMaxStores =
+      STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+  }
 
   // CPU/Feature control
+  HasAlu32 = STI.getHasAlu32();
   HasJmpExt = STI.getHasJmpExt();
 }
 
@@ -189,26 +220,29 @@ SDValue BPFTargetLowering::LowerFormalArguments(
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeFormalArguments(Ins, CC_BPF64);
+  CCInfo.AnalyzeFormalArguments(Ins, getHasAlu32() ? CC_BPF32 : CC_BPF64);
 
   for (auto &VA : ArgLocs) {
     if (VA.isRegLoc()) {
       // Arguments passed in registers
       EVT RegVT = VA.getLocVT();
-      switch (RegVT.getSimpleVT().SimpleTy) {
+      MVT::SimpleValueType SimpleTy = RegVT.getSimpleVT().SimpleTy;
+      switch (SimpleTy) {
       default: {
         errs() << "LowerFormalArguments Unhandled argument type: "
                << RegVT.getEVTString() << '\n';
         llvm_unreachable(0);
       }
+      case MVT::i32:
       case MVT::i64:
-        unsigned VReg = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+        unsigned VReg = RegInfo.createVirtualRegister(SimpleTy == MVT::i64 ?
+                                                      &BPF::GPRRegClass :
+                                                      &BPF::GPR32RegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
         SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
 
-        // If this is an 8/16/32-bit value, it is really passed promoted to 64
-        // bits. Insert an assert[sz]ext to capture this, then truncate to the
-        // right size.
+        // If this is a value that has been promoted to a wider type, insert an
+        // assert[sz]ext to capture this, then truncate to the right size.
         if (VA.getLocInfo() == CCValAssign::SExt)
           ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
                                  DAG.getValueType(VA.getValVT()));
@@ -220,6 +254,8 @@ SDValue BPFTargetLowering::LowerFormalArguments(
           ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
 
         InVals.push_back(ArgValue);
+
+	break;
       }
     } else {
       fail(DL, DAG, "defined with too many args");
@@ -264,7 +300,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
 
-  CCInfo.AnalyzeCallOperands(Outs, CC_BPF64);
+  CCInfo.AnalyzeCallOperands(Outs, getHasAlu32() ? CC_BPF32 : CC_BPF64);
 
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
@@ -388,7 +424,7 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   }
 
   // Analize return values.
-  CCInfo.AnalyzeReturn(Outs, RetCC_BPF64);
+  CCInfo.AnalyzeReturn(Outs, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64);
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -432,7 +468,7 @@ SDValue BPFTargetLowering::LowerCallResult(
     return DAG.getCopyFromReg(Chain, DL, 1, Ins[0].VT, InFlag).getValue(1);
   }
 
-  CCInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
+  CCInfo.AnalyzeCallResult(Ins, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64);
 
   // Copy all of the result registers out of their specified physreg.
   for (auto &Val : RVLocs) {
@@ -485,8 +521,7 @@ SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   if (!getHasJmpExt())
     NegateCC(LHS, RHS, CC);
 
-  SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i64);
-
+  SDValue TargetCC = DAG.getConstant(CC, DL, LHS.getValueType());
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
 
@@ -507,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "BPFISD::BR_CC";
   case BPFISD::Wrapper:
     return "BPFISD::Wrapper";
+  case BPFISD::MEMCPY:
+    return "BPFISD::MEMCPY";
   }
   return nullptr;
 }
@@ -523,14 +560,90 @@ SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
   return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
 }
 
+unsigned
+BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
+                                 unsigned Reg, bool isSigned) const {
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i64);
+  int RShiftOp = isSigned ? BPF::SRA_ri : BPF::SRL_ri;
+  MachineFunction *F = BB->getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned PromotedReg0 = RegInfo.createVirtualRegister(RC);
+  unsigned PromotedReg1 = RegInfo.createVirtualRegister(RC);
+  unsigned PromotedReg2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+  BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1)
+    .addReg(PromotedReg0).addImm(32);
+  BuildMI(BB, DL, TII.get(RShiftOp), PromotedReg2)
+    .addReg(PromotedReg1).addImm(32);
+
+  return PromotedReg2;
+}
+
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                     MachineBasicBlock *BB)
+                                                     const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+  unsigned ScratchReg;
+
+  // This function does custom insertion during lowering BPFISD::MEMCPY which
+  // only has two register operands from memcpy semantics, the copy source
+  // address and the copy destination address.
+  //
+  // Because we will expand BPFISD::MEMCPY into load/store pairs, we will need
+  // a third scratch register to serve as the destination register of load and
+  // source register of store.
+  //
+  // The scratch register here carries the Define | Dead | EarlyClobber flags.
+  // The EarlyClobber flag has the semantic property that the operand it is
+  // attached to is clobbered before the rest of the inputs are read. Hence it
+  // must be unique among the operands to the instruction. The Define flag is
+  // needed to convince the machine verifier that an Undef value isn't a
+  // problem, as we are loading memory into it anyway. The Dead flag is needed
+  // as the value in scratch isn't supposed to be used by any other instruction.
+  ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+  MIB.addReg(ScratchReg,
+             RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+  return BB;
+}
+
 MachineBasicBlock *
 BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL = MI.getDebugLoc();
-  bool isSelectOp = MI.getOpcode() == BPF::Select;
+  unsigned Opc = MI.getOpcode();
+  bool isSelectRROp = (Opc == BPF::Select ||
+                       Opc == BPF::Select_64_32 ||
+                       Opc == BPF::Select_32 ||
+                       Opc == BPF::Select_32_64);
+
+  bool isMemcpyOp = Opc == BPF::MEMCPY;
+
+#ifndef NDEBUG
+  bool isSelectRIOp = (Opc == BPF::Select_Ri ||
+                       Opc == BPF::Select_Ri_64_32 ||
+                       Opc == BPF::Select_Ri_32 ||
+                       Opc == BPF::Select_Ri_32_64);
+
+
+  assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+         "Unexpected instr type to insert");
+#endif
+
+  if (isMemcpyOp)
+    return EmitInstrWithCustomInserterMemcpy(MI, BB);
 
-  assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert");
+  bool is32BitCmp = (Opc == BPF::Select_32 ||
+                     Opc == BPF::Select_32_64 ||
+                     Opc == BPF::Select_Ri_32 ||
+                     Opc == BPF::Select_Ri_32_64);
 
   // To "insert" a SELECT instruction, we actually have to insert the diamond
   // control-flow pattern.  The incoming instruction knows the destination vreg
@@ -561,56 +674,72 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   BB->addSuccessor(Copy1MBB);
 
   // Insert Branch if Flag
-  unsigned LHS = MI.getOperand(1).getReg();
   int CC = MI.getOperand(3).getImm();
   int NewCC;
   switch (CC) {
   case ISD::SETGT:
-    NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri;
+    NewCC = isSelectRROp ? BPF::JSGT_rr : BPF::JSGT_ri;
     break;
   case ISD::SETUGT:
-    NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri;
+    NewCC = isSelectRROp ? BPF::JUGT_rr : BPF::JUGT_ri;
     break;
   case ISD::SETGE:
-    NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri;
+    NewCC = isSelectRROp ? BPF::JSGE_rr : BPF::JSGE_ri;
     break;
   case ISD::SETUGE:
-    NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri;
+    NewCC = isSelectRROp ? BPF::JUGE_rr : BPF::JUGE_ri;
     break;
   case ISD::SETEQ:
-    NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri;
+    NewCC = isSelectRROp ? BPF::JEQ_rr : BPF::JEQ_ri;
     break;
   case ISD::SETNE:
-    NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri;
+    NewCC = isSelectRROp ? BPF::JNE_rr : BPF::JNE_ri;
     break;
   case ISD::SETLT:
-    NewCC = isSelectOp ? BPF::JSLT_rr : BPF::JSLT_ri;
+    NewCC = isSelectRROp ? BPF::JSLT_rr : BPF::JSLT_ri;
     break;
   case ISD::SETULT:
-    NewCC = isSelectOp ? BPF::JULT_rr : BPF::JULT_ri;
+    NewCC = isSelectRROp ? BPF::JULT_rr : BPF::JULT_ri;
     break;
   case ISD::SETLE:
-    NewCC = isSelectOp ? BPF::JSLE_rr : BPF::JSLE_ri;
+    NewCC = isSelectRROp ? BPF::JSLE_rr : BPF::JSLE_ri;
     break;
   case ISD::SETULE:
-    NewCC = isSelectOp ? BPF::JULE_rr : BPF::JULE_ri;
+    NewCC = isSelectRROp ? BPF::JULE_rr : BPF::JULE_ri;
     break;
   default:
     report_fatal_error("unimplemented select CondCode " + Twine(CC));
   }
-  if (isSelectOp)
-    BuildMI(BB, DL, TII.get(NewCC))
-        .addReg(LHS)
-        .addReg(MI.getOperand(2).getReg())
-        .addMBB(Copy1MBB);
-  else {
+
+  unsigned LHS = MI.getOperand(1).getReg();
+  bool isSignedCmp = (CC == ISD::SETGT ||
+                      CC == ISD::SETGE ||
+                      CC == ISD::SETLT ||
+                      CC == ISD::SETLE);
+
+  // eBPF at the moment only has 64-bit comparison. Any 32-bit comparison needs
+  // to be promoted. However, if the 32-bit comparison operands are destination
+  // registers then they are implicitly zero-extended already, so there is no
+  // need for an explicit zero-extend sequence for them.
+  //
+  // We simply do the extension for all situations in this method, but we will
+  // try to remove the unnecessary ones in the BPFMIPeephole pass.
+  if (is32BitCmp)
+    LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp);
+
+  if (isSelectRROp) {
+    unsigned RHS = MI.getOperand(2).getReg();
+
+    if (is32BitCmp)
+      RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp);
+
+    BuildMI(BB, DL, TII.get(NewCC)).addReg(LHS).addReg(RHS).addMBB(Copy1MBB);
+  } else {
     int64_t imm32 = MI.getOperand(2).getImm();
     // sanity check before we build J*_ri instruction.
     assert (isInt<32>(imm32));
     BuildMI(BB, DL, TII.get(NewCC))
-        .addReg(LHS)
-        .addImm(imm32)
-        .addMBB(Copy1MBB);
+        .addReg(LHS).addImm(imm32).addMBB(Copy1MBB);
   }
 
   // Copy0MBB:
@@ -634,3 +763,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   MI.eraseFromParent(); // The pseudo instruction is gone now.
   return BB;
 }
+
+EVT BPFTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+                                          EVT VT) const {
+  return getHasAlu32() ? MVT::i32 : MVT::i64;
+}
+
+MVT BPFTargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+                                              EVT VT) const {
+  return (getHasAlu32() && VT == MVT::i32) ? MVT::i32 : MVT::i64;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
index 6ca2594a7e88..0aa8b9ac57ac 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -28,7 +28,8 @@ enum NodeType : unsigned {
   CALL,
   SELECT_CC,
   BR_CC,
-  Wrapper
+  Wrapper,
+  MEMCPY
 };
 }
 
@@ -54,10 +55,17 @@ public:
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
 
+  bool getHasAlu32() const { return HasAlu32; }
   bool getHasJmpExt() const { return HasJmpExt; }
 
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+
 private:
   // Control Instruction Selection Features
+  bool HasAlu32;
   bool HasJmpExt;
 
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
@@ -100,6 +108,14 @@ private:
                                          Type *Ty) const override {
     return true;
   }
+
+  unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
+                         bool isSigned) const;
+
+  MachineBasicBlock * EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                        MachineBasicBlock *BB)
+                                                        const;
+
 };
 }
 
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 5351cfa95020..4d47debdaa74 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -36,10 +36,92 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   if (BPF::GPRRegClass.contains(DestReg, SrcReg))
     BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (BPF::GPR32RegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(BPF::MOV_rr_32), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
   else
     llvm_unreachable("Impossible reg-to-reg copy");
 }
 
+// Expand the MEMCPY pseudo into straight-line load/store pairs routed through
+// its scratch register operand, then remove the pseudo from its block.
+//
+// Operands read from the pseudo: 0 = destination address register, 1 = source
+// address register, 2 = byte count, 3 = alignment, 4 = scratch register.
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  uint64_t CopyLen = MI->getOperand(2).getImm();
+  uint64_t Alignment = MI->getOperand(3).getImm();
+  unsigned ScratchReg = MI->getOperand(4).getReg();
+
+  MachineBasicBlock *BB = MI->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // The alignment operand selects the access width for the bulk of the copy.
+  unsigned LdOpc, StOpc;
+  switch (Alignment) {
+  case 1:
+    LdOpc = BPF::LDB;
+    StOpc = BPF::STB;
+    break;
+  case 2:
+    LdOpc = BPF::LDH;
+    StOpc = BPF::STH;
+    break;
+  case 4:
+    LdOpc = BPF::LDW;
+    StOpc = BPF::STW;
+    break;
+  case 8:
+    LdOpc = BPF::LDD;
+    StOpc = BPF::STD;
+    break;
+  default:
+    llvm_unreachable("unsupported memcpy alignment");
+  }
+
+  // Emit one load into the scratch register followed by the matching store,
+  // both addressing the same byte offset from their base register.
+  auto CopyChunk = [&](unsigned LoadOpc, unsigned StoreOpc, int64_t Off) {
+    BuildMI(*BB, MI, dl, get(LoadOpc))
+            .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Off);
+    BuildMI(*BB, MI, dl, get(StoreOpc))
+            .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Off);
+  };
+
+  // Whole Alignment-sized chunks first.
+  unsigned ChunkCount = CopyLen >> Log2_64(Alignment);
+  for (unsigned Idx = 0; Idx != ChunkCount; ++Idx)
+    CopyChunk(LdOpc, StOpc, Idx * Alignment);
+
+  // Then the tail, split into at most one 4-, one 2- and one 1-byte access.
+  unsigned Tail = CopyLen & (Alignment - 1);
+  unsigned Offset = ChunkCount * Alignment;
+  if (Tail & 0x4) {
+    CopyChunk(BPF::LDW, BPF::STW, Offset);
+    Offset += 4;
+  }
+  if (Tail & 0x2) {
+    CopyChunk(BPF::LDH, BPF::STH, Offset);
+    Offset += 2;
+  }
+  if (Tail & 0x1)
+    CopyChunk(BPF::LDB, BPF::STB, Offset);
+
+  // Every access has been emitted in front of the pseudo; drop it now.
+  BB->erase(MI);
+}
+
+// Expand the MEMCPY pseudo; returns true only when MI was one.
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() != BPF::MEMCPY)
+    return false;
+
+  expandMEMCPY(MI);
+  return true;
+}
+
 void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        unsigned SrcReg, bool IsKill, int FI,
@@ -54,6 +136,11 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
         .addReg(SrcReg, getKillRegState(IsKill))
         .addFrameIndex(FI)
         .addImm(0);
+  else if (RC == &BPF::GPR32RegClass)
+    BuildMI(MBB, I, DL, get(BPF::STW32))
+        .addReg(SrcReg, getKillRegState(IsKill))
+        .addFrameIndex(FI)
+        .addImm(0);
   else
     llvm_unreachable("Can't store this register to stack slot");
 }
@@ -69,6 +156,8 @@ void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   if (RC == &BPF::GPRRegClass)
     BuildMI(MBB, I, DL, get(BPF::LDD), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == &BPF::GPR32RegClass)
+    BuildMI(MBB, I, DL, get(BPF::LDW32), DestReg).addFrameIndex(FI).addImm(0);
   else
     llvm_unreachable("Can't load this register from stack slot");
 }
@@ -83,7 +172,7 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
   MachineBasicBlock::iterator I = MBB.end();
   while (I != MBB.begin()) {
     --I;
-    if (I->isDebugValue())
+    if (I->isDebugInstr())
       continue;
 
     // Working from the bottom, when we see a non-terminator
@@ -158,7 +247,7 @@ unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB,
 
   while (I != MBB.begin()) {
     --I;
-    if (I->isDebugValue())
+    if (I->isDebugInstr())
       continue;
     if (I->getOpcode() != BPF::JMP)
       break;
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
index f591f48a89a6..fb65a86a6d18 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -34,6 +34,8 @@ public:
                    const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
 
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, unsigned SrcReg,
                            bool isKill, int FrameIndex,
@@ -55,6 +57,9 @@ public:
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+private:
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
 };
 }
 
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
index 126d55fc28de..aaef5fb706e0 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -28,6 +28,10 @@ def SDT_BPFBrCC         : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
                                                SDTCisVT<3, OtherVT>]>;
 def SDT_BPFWrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY       : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+                                               SDTCisVT<1, i64>,
+                                               SDTCisVT<2, i64>,
+                                               SDTCisVT<3, i64>]>;
 
 def BPFcall         : SDNode<"BPFISD::CALL", SDT_BPFCall,
                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -43,8 +47,13 @@ def BPFbrcc         : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
 
 def BPFselectcc     : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
 def BPFWrapper      : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy       : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+                             [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                              SDNPMayStore, SDNPMayLoad]>;
 def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
 def BPFIsBigEndian    : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
+def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
+def BPFNoALU32 : Predicate<"!Subtarget->getHasAlu32()">;
 
 def brtarget : Operand<OtherVT> {
   let PrintMethod = "printBrTargetOperand";
@@ -57,6 +66,8 @@ def u64imm   : Operand<i64> {
 
 def i64immSExt32 : PatLeaf<(i64 imm),
                 [{return isInt<32>(N->getSExtValue()); }]>;
+def i32immSExt32 : PatLeaf<(i32 imm),
+                [{return isInt<32>(N->getSExtValue()); }]>;
 
 // Addressing modes.
 def ADDRri : ComplexPattern<i64, 2, "SelectAddr", [], []>;
@@ -218,7 +229,7 @@ multiclass ALU<BPFArithOp Opc, string OpcodeStr, SDNode OpNode> {
                    (outs GPR32:$dst),
                    (ins GPR32:$src2, i32imm:$imm),
                    "$dst "#OpcodeStr#" $imm",
-                   [(set GPR32:$dst, (OpNode GPR32:$src2, i32:$imm))]>;
+                   [(set GPR32:$dst, (OpNode GPR32:$src2, i32immSExt32:$imm))]>;
 }
 
 let Constraints = "$dst = $src2" in {
@@ -292,7 +303,7 @@ def MOV_ri_32 : ALU_RI<BPF_ALU, BPF_MOV,
                     (outs GPR32:$dst),
                     (ins i32imm:$imm),
                     "$dst = $imm",
-                    [(set GPR32:$dst, (i32 i32:$imm))]>;
+                    [(set GPR32:$dst, (i32 i32immSExt32:$imm))]>;
 }
 
 def FI_ri
@@ -347,9 +358,11 @@ class STORE<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
 class STOREi64<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
     : STORE<Opc, OpcodeStr, [(OpNode i64:$src, ADDRri:$addr)]>;
 
-def STW : STOREi64<BPF_W, "u32", truncstorei32>;
-def STH : STOREi64<BPF_H, "u16", truncstorei16>;
-def STB : STOREi64<BPF_B, "u8", truncstorei8>;
+let Predicates = [BPFNoALU32] in {
+  def STW : STOREi64<BPF_W, "u32", truncstorei32>;
+  def STH : STOREi64<BPF_H, "u16", truncstorei16>;
+  def STB : STOREi64<BPF_B, "u8", truncstorei8>;
+}
 def STD : STOREi64<BPF_DW, "u64", store>;
 
 // LOAD instructions
@@ -371,9 +384,13 @@ class LOAD<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
 class LOADi64<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
     : LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
 
-def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
-def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
-def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+
+let Predicates = [BPFNoALU32] in {
+  def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
+  def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
+  def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+}
+
 def LDD : LOADi64<BPF_DW, "u64", load>;
 
 class BRANCH<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
@@ -456,7 +473,7 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
 }
 
 // ADJCALLSTACKDOWN/UP pseudo insns
-let Defs = [R11], Uses = [R11] in {
+let Defs = [R11], Uses = [R11], isCodeGenOnly = 1 in {
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
                               "#ADJCALLSTACKDOWN $amt1 $amt2",
                               [(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
@@ -465,7 +482,7 @@ def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
                               [(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
 }
 
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
   def Select : Pseudo<(outs GPR:$dst),
                       (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
                       "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
@@ -476,6 +493,36 @@ let usesCustomInserter = 1 in {
                       "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
                       [(set i64:$dst,
                        (BPFselectcc i64:$lhs, (i64immSExt32:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>;
+  def Select_64_32 : Pseudo<(outs GPR32:$dst),
+                      (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR32:$src, GPR32:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i32:$dst,
+                       (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i32:$src, i32:$src2))]>;
+  def Select_Ri_64_32 : Pseudo<(outs GPR32:$dst),
+                      (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR32:$src, GPR32:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i32:$dst,
+                       (BPFselectcc i64:$lhs, (i64immSExt32:$rhs), (i64 imm:$imm), i32:$src, i32:$src2))]>;
+  def Select_32 : Pseudo<(outs GPR32:$dst),
+                      (ins GPR32:$lhs, GPR32:$rhs, i32imm:$imm, GPR32:$src, GPR32:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i32:$dst,
+                       (BPFselectcc i32:$lhs, i32:$rhs, (i32 imm:$imm), i32:$src, i32:$src2))]>;
+  def Select_Ri_32 : Pseudo<(outs GPR32:$dst),
+                      (ins GPR32:$lhs, i32imm:$rhs, i32imm:$imm, GPR32:$src, GPR32:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i32:$dst,
+                       (BPFselectcc i32:$lhs, (i32immSExt32:$rhs), (i32 imm:$imm), i32:$src, i32:$src2))]>;
+  def Select_32_64 : Pseudo<(outs GPR:$dst),
+                      (ins GPR32:$lhs, GPR32:$rhs, i32imm:$imm, GPR:$src, GPR:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i64:$dst,
+                       (BPFselectcc i32:$lhs, i32:$rhs, (i32 imm:$imm), i64:$src, i64:$src2))]>;
+  def Select_Ri_32_64 : Pseudo<(outs GPR:$dst),
+                      (ins GPR32:$lhs, i32imm:$rhs, i32imm:$imm, GPR:$src, GPR:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i64:$dst,
+                       (BPFselectcc i32:$lhs, (i32immSExt32:$rhs), (i32 imm:$imm), i64:$src, i64:$src2))]>;
 }
 
 // load 64-bit global addr into register
@@ -492,9 +539,11 @@ def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
 def : Pat<(BPFcall GPR:$dst), (JALX GPR:$dst)>;
 
 // Loads
-def : Pat<(extloadi8  ADDRri:$src), (i64 (LDB ADDRri:$src))>;
-def : Pat<(extloadi16 ADDRri:$src), (i64 (LDH ADDRri:$src))>;
-def : Pat<(extloadi32 ADDRri:$src), (i64 (LDW ADDRri:$src))>;
+let Predicates = [BPFNoALU32] in {
+  def : Pat<(i64 (extloadi8  ADDRri:$src)), (i64 (LDB ADDRri:$src))>;
+  def : Pat<(i64 (extloadi16 ADDRri:$src)), (i64 (LDH ADDRri:$src))>;
+  def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>;
+}
 
 // Atomics
 class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
@@ -581,3 +630,102 @@ def LD_ABS_W : LOAD_ABS<BPF_W, "u32", int_bpf_load_word>;
 def LD_IND_B : LOAD_IND<BPF_B, "u8", int_bpf_load_byte>;
 def LD_IND_H : LOAD_IND<BPF_H, "u16", int_bpf_load_half>;
 def LD_IND_W : LOAD_IND<BPF_W, "u32", int_bpf_load_word>;
+
+let isCodeGenOnly = 1 in {
+  def MOV_32_64 : ALU_RR<BPF_ALU, BPF_MOV,
+                         (outs GPR:$dst), (ins GPR32:$src),
+                         "$dst = $src", []>;
+}
+
+def : Pat<(i64 (sext GPR32:$src)),
+          (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+
+def : Pat<(i64 (zext GPR32:$src)),
+          (SRL_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+
+// For i64 -> i32 truncation, use the 32-bit subregister directly.
+def : Pat<(i32 (trunc GPR:$src)),
+          (i32 (EXTRACT_SUBREG GPR:$src, sub_32))>;
+
+// For i32 -> i64 anyext, we don't care about the high bits.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+class STORE32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
+    : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+                 (outs),
+                 (ins GPR32:$src, MEMri:$addr),
+                 "*("#OpcodeStr#" *)($addr) = $src",
+                 Pattern> {
+  bits<4> src;
+  bits<20> addr;
+
+  let Inst{51-48} = addr{19-16}; // base reg
+  let Inst{55-52} = src;
+  let Inst{47-32} = addr{15-0}; // offset
+  let BPFClass = BPF_STX;
+}
+
+class STOREi32<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
+    : STORE32<Opc, OpcodeStr, [(OpNode i32:$src, ADDRri:$addr)]>;
+
+let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+  def STW32 : STOREi32<BPF_W, "u32", store>;
+  def STH32 : STOREi32<BPF_H, "u16", truncstorei16>;
+  def STB32 : STOREi32<BPF_B, "u8", truncstorei8>;
+}
+
+class LOAD32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
+    : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+                (outs GPR32:$dst),
+                (ins MEMri:$addr),
+                "$dst = *("#OpcodeStr#" *)($addr)",
+                Pattern> {
+  bits<4> dst;
+  bits<20> addr;
+
+  let Inst{51-48} = dst;
+  let Inst{55-52} = addr{19-16};
+  let Inst{47-32} = addr{15-0};
+  let BPFClass = BPF_LDX;
+}
+
+class LOADi32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+    : LOAD32<SizeOp, OpcodeStr, [(set i32:$dst, (OpNode ADDRri:$addr))]>;
+
+let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+  def LDW32 : LOADi32<BPF_W, "u32", load>;
+  def LDH32 : LOADi32<BPF_H, "u16", zextloadi16>;
+  def LDB32 : LOADi32<BPF_B, "u8", zextloadi8>;
+}
+
+let Predicates = [BPFHasALU32] in {
+  def : Pat<(truncstorei8 GPR:$src, ADDRri:$dst),
+            (STB32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+  def : Pat<(truncstorei16 GPR:$src, ADDRri:$dst),
+            (STH32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+  def : Pat<(truncstorei32 GPR:$src, ADDRri:$dst),
+            (STW32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
+  def : Pat<(i32 (extloadi8 ADDRri:$src)), (i32 (LDB32 ADDRri:$src))>;
+  def : Pat<(i32 (extloadi16 ADDRri:$src)), (i32 (LDH32 ADDRri:$src))>;
+  def : Pat<(i64 (zextloadi8  ADDRri:$src)),
+            (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
+  def : Pat<(i64 (zextloadi16 ADDRri:$src)),
+            (SUBREG_TO_REG (i64 0), (LDH32 ADDRri:$src), sub_32)>;
+  def : Pat<(i64 (zextloadi32 ADDRri:$src)),
+            (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
+  def : Pat<(i64 (extloadi8  ADDRri:$src)),
+            (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
+  def : Pat<(i64 (extloadi16 ADDRri:$src)),
+            (SUBREG_TO_REG (i64 0), (LDH32 ADDRri:$src), sub_32)>;
+  def : Pat<(i64 (extloadi32 ADDRri:$src)),
+            (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
+}
+
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+    def MEMCPY : Pseudo<
+      (outs),
+      (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+      "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+      [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp
new file mode 100644
index 000000000000..9e984d0facfb
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -0,0 +1,284 @@
+//===-------------- BPFMIPeephole.cpp - MI Peephole Cleanups  -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs peephole optimizations to cleanup ugly code sequences at
+// MachineInstruction layer.
+//
+// Currently, there are two optimizations implemented:
+//  - One pre-RA MachineSSA pass to eliminate type promotion sequences that
+//    zero extend 32-bit subregisters to 64-bit registers, when the compiler
+//    can prove the subregister is defined by a 32-bit operation, in which
+//    case the upper half of the underlying 64-bit register was zeroed
+//    implicitly.
+//
+//  - One post-RA PreEmit pass to do final cleanup on some redundant
+//    instructions generated due to bad RA on subregister.
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-zext-elim"
+
+STATISTIC(ZExtElemNum, "Number of zero extension shifts eliminated");
+
+namespace {
+
+struct BPFMIPeephole : public MachineFunctionPass {
+
+  static char ID;
+  const BPFInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+
+  BPFMIPeephole() : MachineFunctionPass(ID) {
+    initializeBPFMIPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  bool isMovFrom32Def(MachineInstr *MovMI);
+  bool eliminateZExtSeq(void);
+
+public:
+
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    initialize(MF);
+
+    return eliminateZExtSeq();
+  }
+};
+
+// Initialize class variables.
+void BPFMIPeephole::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  MRI = &MF->getRegInfo();
+  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n");
+}
+
+// Return true unless the def of MovMI's source is missing, is a PHI with an
+// input that fails the quick check, or is a COPY from a virtual GPR register.
+bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) {
+  MachineInstr *DefInsn = MRI->getVRegDef(MovMI->getOperand(1).getReg());
+  // Check for a missing definition before the debug dump dereferences it.
+  if (!DefInsn)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "  Def of Mov Src:");
+  LLVM_DEBUG(DefInsn->dump());
+
+  if (DefInsn->isPHI()) {
+    for (unsigned i = 1, e = DefInsn->getNumOperands(); i < e; i += 2) {
+      MachineOperand &opnd = DefInsn->getOperand(i);
+      if (!opnd.isReg())
+        return false;
+
+      MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
+      // quick check on PHI incoming definitions.
+      if (!PhiDef || PhiDef->isPHI() || PhiDef->getOpcode() == BPF::COPY)
+        return false;
+    }
+  }
+
+  if (DefInsn->getOpcode() == BPF::COPY) {
+    MachineOperand &opnd = DefInsn->getOperand(1);
+
+    if (!opnd.isReg())
+      return false;
+
+    unsigned Reg = opnd.getReg();
+    if ((TargetRegisterInfo::isVirtualRegister(Reg) &&
+         MRI->getRegClass(Reg) == &BPF::GPRRegClass))
+       return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "  One ZExt elim sequence identified.\n");
+
+  return true;
+}
+
+// Replace each "MOV_32_64; SLL_ri 32; SRL_ri 32" chain that qualifies under
+// isMovFrom32Def with a single SUBREG_TO_REG. Returns true if any chain was
+// rewritten.
+bool BPFMIPeephole::eliminateZExtSeq(void) {
+  bool Eliminated = false;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    // Advance the iterator before the body runs so that erasing MI below
+    // cannot invalidate the walk, even when MI is the last instruction.
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+      MachineInstr &MI = *I++;
+
+      // Eliminate the 32-bit to 64-bit zero extension sequence when possible.
+      //
+      //   MOV_32_64 rB, wA
+      //   SLL_ri    rB, rB, 32
+      //   SRL_ri    rB, rB, 32
+      if (MI.getOpcode() == BPF::SRL_ri &&
+          MI.getOperand(2).getImm() == 32) {
+        unsigned DstReg = MI.getOperand(0).getReg();
+        unsigned ShfReg = MI.getOperand(1).getReg();
+        MachineInstr *SllMI = MRI->getVRegDef(ShfReg);
+
+        LLVM_DEBUG(dbgs() << "Starting SRL found:");
+        LLVM_DEBUG(MI.dump());
+
+        if (!SllMI ||
+            SllMI->isPHI() ||
+            SllMI->getOpcode() != BPF::SLL_ri ||
+            SllMI->getOperand(2).getImm() != 32)
+          continue;
+
+        LLVM_DEBUG(dbgs() << "  SLL found:");
+        LLVM_DEBUG(SllMI->dump());
+
+        MachineInstr *MovMI = MRI->getVRegDef(SllMI->getOperand(1).getReg());
+        if (!MovMI ||
+            MovMI->isPHI() ||
+            MovMI->getOpcode() != BPF::MOV_32_64)
+          continue;
+
+        LLVM_DEBUG(dbgs() << "  Type cast Mov found:");
+        LLVM_DEBUG(MovMI->dump());
+
+        unsigned SubReg = MovMI->getOperand(1).getReg();
+        if (!isMovFrom32Def(MovMI)) {
+          LLVM_DEBUG(dbgs()
+                     << "  One ZExt elim sequence failed qualifying elim.\n");
+          continue;
+        }
+
+        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::SUBREG_TO_REG), DstReg)
+          .addImm(0).addReg(SubReg).addImm(BPF::sub_32);
+
+        SllMI->eraseFromParent();
+        MovMI->eraseFromParent();
+        // The iterator has already moved past MI, so the right shift can be
+        // erased immediately instead of being deferred to a later iteration.
+        MI.eraseFromParent();
+        ZExtElemNum++;
+        Eliminated = true;
+      }
+    }
+  }
+
+  return Eliminated;
+}
+
+} // end default namespace
+
+INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
+                "BPF MachineSSA Peephole Optimization", false, false)
+
+char BPFMIPeephole::ID = 0;
+FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }
+
+STATISTIC(RedundantMovElemNum, "Number of redundant moves eliminated");
+
+namespace {
+
+struct BPFMIPreEmitPeephole : public MachineFunctionPass {
+
+  static char ID;
+  MachineFunction *MF;
+  const TargetRegisterInfo *TRI;
+
+  BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {
+    initializeBPFMIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  bool eliminateRedundantMov(void);
+
+public:
+
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    initialize(MF);
+
+    return eliminateRedundantMov();
+  }
+};
+
+// Initialize class variables.
+void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  TRI = MF->getSubtarget<BPFSubtarget>().getRegisterInfo();
+  LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
+}
+
+// Drop every MOV_32_64 whose source already is the sub_32 sub-register of its
+// destination. Returns true if any instruction was removed.
+bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
+  bool Eliminated = false;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    // Advance the iterator before the body runs so that erasing MI below
+    // cannot invalidate the walk, even when MI is the last instruction.
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+      MachineInstr &MI = *I++;
+
+      // Eliminate identical move:
+      //
+      //   MOV rA, rA
+      //
+      // This is particularly likely to happen when sub-register support is
+      // enabled. The special type cast insn MOV_32_64 involves different
+      // register classes on src (i32) and dst (i64), so RA could generate a
+      // useless instruction due to this.
+      if (MI.getOpcode() == BPF::MOV_32_64) {
+        unsigned dst = MI.getOperand(0).getReg();
+        unsigned dst_sub = TRI->getSubReg(dst, BPF::sub_32);
+        unsigned src = MI.getOperand(1).getReg();
+
+        if (dst_sub != src)
+          continue;
+
+        LLVM_DEBUG(dbgs() << "  Redundant Mov Eliminated:");
+        LLVM_DEBUG(MI.dump());
+        // The iterator has already moved past MI, so erase it right away.
+        MI.eraseFromParent();
+        RedundantMovElemNum++;
+        Eliminated = true;
+      }
+    }
+  }
+
+  return Eliminated;
+}
+
+} // end default namespace
+
+INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",
+                "BPF PreEmit Peephole Optimization", false, false)
+
+char BPFMIPreEmitPeephole::ID = 0;
+FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
+{
+  return new BPFMIPreEmitPeephole();
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
index 6f7067816098..635c11113151 100644
--- a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -37,8 +37,8 @@ BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  Reserved.set(BPF::R10); // R10 is read only frame pointer
-  Reserved.set(BPF::R11); // R11 is pseudo stack pointer
+  markSuperRegs(Reserved, BPF::W10); // [W|R]10 is read only frame pointer
+  markSuperRegs(Reserved, BPF::W11); // [W|R]11 is pseudo stack pointer
   return Reserved;
 }
 
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
index 4202850e9eb9..bb0d6bcf5450 100644
--- a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
@@ -29,6 +29,8 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo {
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
+  bool enableMultipleCopyHints() const override { return true; }
+
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
diff --git a/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..24d5f59bbfd7
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Requires the copy size to be a constant.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  // Keep the length 64 bits wide and round up without adding to it, so a huge
+  // constant can neither be truncated nor wrap around to a small estimate.
+  uint64_t CopyLen = ConstantSize->getZExtValue();
+  uint64_t StoresNumEstimate =
+      (CopyLen >> Log2_32(Align)) + ((CopyLen & (Align - 1)) != 0);
+  // Impose the same copy length limit as MaxStoresPerMemcpy.
+  if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                    DAG.getConstant(CopyLen, dl, MVT::i64),
+                    DAG.getConstant(Align, dl, MVT::i64));
+  return Dst.getValue(0);
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
new file mode 100644
index 000000000000..19d3c5769573
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 42ca87f9ef67..56780bd9d46f 100644
--- a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -30,11 +30,14 @@ BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                             StringRef FS) {
   initializeEnvironment();
   initSubtargetFeatures(CPU, FS);
+  ParseSubtargetFeatures(CPU, FS);
   return *this;
 }
 
 void BPFSubtarget::initializeEnvironment() {
   HasJmpExt = false;
+  HasAlu32 = false;
+  UseDwarfRIS = false;
 }
 
 void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
index fa1f24443bc3..60e56435fe4c 100644
--- a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -17,6 +17,7 @@
 #include "BPFFrameLowering.h"
 #include "BPFISelLowering.h"
 #include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   BPFInstrInfo InstrInfo;
   BPFFrameLowering FrameLowering;
   BPFTargetLowering TLInfo;
-  SelectionDAGTargetInfo TSInfo;
+  BPFSelectionDAGInfo TSInfo;
 
 private:
   void initializeEnvironment();
@@ -47,6 +48,12 @@ protected:
   // whether the cpu supports jmp ext
   bool HasJmpExt;
 
+  // whether the cpu supports alu32 instructions.
+  bool HasAlu32;
+
+  // whether we should disable MCAsmInfo DwarfUsesRelocationsAcrossSections
+  bool UseDwarfRIS;
+
 public:
   // This constructor initializes the data members to match that
   // of the specified triple.
@@ -59,6 +66,8 @@ public:
   // subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
   bool getHasJmpExt() const { return HasJmpExt; }
+  bool getHasAlu32() const { return HasAlu32; }
+  bool getUseDwarfRIS() const { return UseDwarfRIS; }
 
   const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const BPFFrameLowering *getFrameLowering() const override {
@@ -67,7 +76,7 @@ public:
   const BPFTargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
-  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+  const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
   const TargetRegisterInfo *getRegisterInfo() const override {
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 60672fa2684b..84d89bff74fe 100644
--- a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -13,6 +13,7 @@
 
 #include "BPFTargetMachine.h"
 #include "BPF.h"
+#include "MCTargetDesc/BPFMCAsmInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -22,11 +23,18 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+static cl::
+opt<bool> DisableMIPeephole("disable-bpf-peephole", cl::Hidden,
+                            cl::desc("Disable machine peepholes for BPF"));
+
 extern "C" void LLVMInitializeBPFTarget() {
   // Register the target.
   RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
   RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
   RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeBPFMIPeepholePass(PR);
 }
 
 // DataLayout: little or big endian
@@ -61,6 +69,9 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
+
+  BPFMCAsmInfo *MAI = static_cast<BPFMCAsmInfo *>(const_cast<MCAsmInfo *>(AsmInfo));
+  MAI->setDwarfUsesRelocationsAcrossSections(!Subtarget.getUseDwarfRIS());
 }
 namespace {
 // BPF Code Generator Pass Configuration Options.
@@ -74,6 +85,8 @@ public:
   }
 
   bool addInstSelector() override;
+  void addMachineSSAOptimization() override;
+  void addPreEmitPass() override;
 };
 }
 
@@ -88,3 +101,21 @@ bool BPFPassConfig::addInstSelector() {
 
   return false;
 }
+
+void BPFPassConfig::addMachineSSAOptimization() {
+  // The default implementation must be called first because we want the
+  // eBPF peephole pass to run last.
+  TargetPassConfig::addMachineSSAOptimization();
+
+  const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
+  if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+    addPass(createBPFMIPeepholePass());
+}
+
+void BPFPassConfig::addPreEmitPass() {
+  const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
+
+  if (getOptLevel() != CodeGenOpt::None)
+    if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+      addPass(createBPFMIPreEmitPeepholePass());
+}
diff --git a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 6fc87d79c439..e7790ddb3d7e 100644
--- a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -35,6 +35,34 @@ namespace {
 /// A disassembler class for BPF.
 class BPFDisassembler : public MCDisassembler {
 public:
+  enum BPF_CLASS {
+    BPF_LD = 0x0,
+    BPF_LDX = 0x1,
+    BPF_ST = 0x2,
+    BPF_STX = 0x3,
+    BPF_ALU = 0x4,
+    BPF_JMP = 0x5,
+    BPF_RES = 0x6,
+    BPF_ALU64 = 0x7
+  };
+
+  enum BPF_SIZE {
+    BPF_W = 0x0,
+    BPF_H = 0x1,
+    BPF_B = 0x2,
+    BPF_DW = 0x3
+  };
+
+  enum BPF_MODE {
+    BPF_IMM = 0x0,
+    BPF_ABS = 0x1,
+    BPF_IND = 0x2,
+    BPF_MEM = 0x3,
+    BPF_LEN = 0x4,
+    BPF_MSH = 0x5,
+    BPF_XADD = 0x6
+  };
+
   BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
       : MCDisassembler(STI, Ctx) {}
   ~BPFDisassembler() override = default;
@@ -43,6 +71,10 @@ public:
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &VStream,
                               raw_ostream &CStream) const override;
+
+  uint8_t getInstClass(uint64_t Inst) const { return (Inst >> 56) & 0x7; };
+  uint8_t getInstSize(uint64_t Inst) const { return (Inst >> 59) & 0x3; };
+  uint8_t getInstMode(uint64_t Inst) const { return (Inst >> 61) & 0x7; };
 };
 
 } // end anonymous namespace
@@ -141,8 +173,17 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
   Result = readInstruction64(Bytes, Address, Size, Insn, IsLittleEndian);
   if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
 
-  Result = decodeInstruction(DecoderTableBPF64, Instr, Insn,
-                             Address, this, STI);
+  uint8_t InstClass = getInstClass(Insn);
+  if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
+      getInstSize(Insn) != BPF_DW &&
+      getInstMode(Insn) == BPF_MEM &&
+      STI.getFeatureBits()[BPF::ALU32])
+    Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
+                               this, STI);
+  else
+    Result = decodeInstruction(DecoderTableBPF64, Instr, Insn, Address, this,
+                               STI);
+
   if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
 
   switch (Instr.getOpcode()) {
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
index 1f4ef098403d..20627da38817 100644
--- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "BPFInstPrinter.h"
-#include "BPF.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 6593d9d018fd..6c255e9ef780 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -12,6 +12,7 @@
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/EndianStream.h"
 #include <cassert>
 #include <cstdint>
 
@@ -21,18 +22,16 @@ namespace {
 
 class BPFAsmBackend : public MCAsmBackend {
 public:
-  bool IsLittleEndian;
-
-  BPFAsmBackend(bool IsLittleEndian)
-    : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+  BPFAsmBackend(support::endianness Endian) : MCAsmBackend(Endian) {}
   ~BPFAsmBackend() override = default;
 
   void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                   const MCValue &Target, MutableArrayRef<char> Data,
-                  uint64_t Value, bool IsResolved) const override;
+                  uint64_t Value, bool IsResolved,
+                  const MCSubtargetInfo *STI) const override;
 
-  std::unique_ptr<MCObjectWriter>
-  createObjectWriter(raw_pwrite_stream &OS) const override;
+  std::unique_ptr<MCObjectTargetWriter>
+  createObjectTargetWriter() const override;
 
   // No instruction requires relaxation
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -43,22 +42,25 @@ public:
 
   unsigned getNumFixupKinds() const override { return 1; }
 
-  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+  bool mayNeedRelaxation(const MCInst &Inst,
+                         const MCSubtargetInfo &STI) const override {
+    return false;
+  }
 
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override {}
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
 };
 
 } // end anonymous namespace
 
-bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
   if ((Count % 8) != 0)
     return false;
 
   for (uint64_t i = 0; i < Count; i += 8)
-    OW->write64(0x15000000);
+    support::endian::write<uint64_t>(OS, 0x15000000, Endian);
 
   return true;
 }
@@ -66,19 +68,17 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                                const MCValue &Target,
                                MutableArrayRef<char> Data, uint64_t Value,
-                               bool IsResolved) const {
+                               bool IsResolved,
+                               const MCSubtargetInfo *STI) const {
   if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
     assert(Value == 0);
-  } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
-    unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
-
-    for (unsigned i = 0; i != Size; ++i) {
-      unsigned Idx = IsLittleEndian ? i : Size - i - 1;
-      Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
-    }
+  } else if (Fixup.getKind() == FK_Data_4) {
+    support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian);
+  } else if (Fixup.getKind() == FK_Data_8) {
+    support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian);
   } else if (Fixup.getKind() == FK_PCRel_4) {
     Value = (uint32_t)((Value - 8) / 8);
-    if (IsLittleEndian) {
+    if (Endian == support::little) {
       Data[Fixup.getOffset() + 1] = 0x10;
       support::endian::write32le(&Data[Fixup.getOffset() + 4], Value);
     } else {
@@ -88,31 +88,26 @@ void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
   } else {
     assert(Fixup.getKind() == FK_PCRel_2);
     Value = (uint16_t)((Value - 8) / 8);
-    if (IsLittleEndian) {
-      Data[Fixup.getOffset() + 2] = Value & 0xFF;
-      Data[Fixup.getOffset() + 3] = Value >> 8;
-    } else {
-      Data[Fixup.getOffset() + 2] = Value >> 8;
-      Data[Fixup.getOffset() + 3] = Value & 0xFF;
-    }
+    support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value,
+                                     Endian);
   }
 }
 
-std::unique_ptr<MCObjectWriter>
-BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
-  return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+BPFAsmBackend::createObjectTargetWriter() const {
+  return createBPFELFObjectWriter(0);
 }
 
 MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
                                         const MCSubtargetInfo &STI,
                                         const MCRegisterInfo &MRI,
                                         const MCTargetOptions &) {
-  return new BPFAsmBackend(/*IsLittleEndian=*/true);
+  return new BPFAsmBackend(support::little);
 }
 
 MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T,
                                           const MCSubtargetInfo &STI,
                                           const MCRegisterInfo &MRI,
                                           const MCTargetOptions &) {
-  return new BPFAsmBackend(/*IsLittleEndian=*/false);
+  return new BPFAsmBackend(support::big);
 }
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index f7de612dab15..134e890dfe49 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -54,9 +54,7 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
   }
 }
 
-std::unique_ptr<MCObjectWriter>
-llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
-                               bool IsLittleEndian) {
-  return createELFObjectWriter(llvm::make_unique<BPFELFObjectWriter>(OSABI), OS,
-                               IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createBPFELFObjectWriter(uint8_t OSABI) {
+  return llvm::make_unique<BPFELFObjectWriter>(OSABI);
 }
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index fd7c97bf1f0a..171f7f607ff4 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -44,6 +44,10 @@ public:
     // line numbers, etc.
     CodePointerSize = 8;
   }
+
+  void setDwarfUsesRelocationsAcrossSections(bool enable) {
+    DwarfUsesRelocationsAcrossSections = enable;
+  }
 };
 }
 
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index b4ecfdee7bff..437f658caf6e 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -122,44 +122,35 @@ void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                               computeAvailableFeatures(STI.getFeatureBits()));
 
   unsigned Opcode = MI.getOpcode();
-  support::endian::Writer<support::little> LE(OS);
-  support::endian::Writer<support::big> BE(OS);
+  support::endian::Writer OSE(OS,
+                              IsLittleEndian ? support::little : support::big);
 
   if (Opcode == BPF::LD_imm64 || Opcode == BPF::LD_pseudo) {
     uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
-    LE.write<uint8_t>(Value >> 56);
+    OS << char(Value >> 56);
     if (IsLittleEndian)
-      LE.write<uint8_t>((Value >> 48) & 0xff);
+      OS << char((Value >> 48) & 0xff);
     else
-      LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
-    LE.write<uint16_t>(0);
-    if (IsLittleEndian)
-      LE.write<uint32_t>(Value & 0xffffFFFF);
-    else
-      BE.write<uint32_t>(Value & 0xffffFFFF);
+      OS << char(SwapBits((Value >> 48) & 0xff));
+    OSE.write<uint16_t>(0);
+    OSE.write<uint32_t>(Value & 0xffffFFFF);
 
     const MCOperand &MO = MI.getOperand(1);
     uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
-    LE.write<uint8_t>(0);
-    LE.write<uint8_t>(0);
-    LE.write<uint16_t>(0);
-    if (IsLittleEndian)
-      LE.write<uint32_t>(Imm >> 32);
-    else
-      BE.write<uint32_t>(Imm >> 32);
+    OSE.write<uint8_t>(0);
+    OSE.write<uint8_t>(0);
+    OSE.write<uint16_t>(0);
+    OSE.write<uint32_t>(Imm >> 32);
   } else {
     // Get instruction encoding and emit it
     uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
-    LE.write<uint8_t>(Value >> 56);
-    if (IsLittleEndian) {
-      LE.write<uint8_t>((Value >> 48) & 0xff);
-      LE.write<uint16_t>((Value >> 32) & 0xffff);
-      LE.write<uint32_t>(Value & 0xffffFFFF);
-    } else {
-      LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
-      BE.write<uint16_t>((Value >> 32) & 0xffff);
-      BE.write<uint32_t>(Value & 0xffffFFFF);
-    }
+    OS << char(Value >> 56);
+    if (IsLittleEndian)
+      OS << char((Value >> 48) & 0xff);
+    else
+      OS << char(SwapBits((Value >> 48) & 0xff));
+    OSE.write<uint16_t>((Value >> 32) & 0xffff);
+    OSE.write<uint32_t>(Value & 0xffffFFFF);
   }
 }
 
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index cbf1ea7d7fb8..834b57527882 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -52,10 +52,10 @@ static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
 
 static MCStreamer *createBPFMCStreamer(const Triple &T, MCContext &Ctx,
                                        std::unique_ptr<MCAsmBackend> &&MAB,
-                                       raw_pwrite_stream &OS,
+                                       std::unique_ptr<MCObjectWriter> &&OW,
                                        std::unique_ptr<MCCodeEmitter> &&Emitter,
                                        bool RelaxAll) {
-  return createELFStreamer(Ctx, std::move(MAB), OS, std::move(Emitter),
+  return createELFStreamer(Ctx, std::move(MAB), std::move(OW), std::move(Emitter),
                            RelaxAll);
 }
 
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index a6dac3abca02..6d2f0a1601e6 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -24,7 +24,7 @@ class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
 class MCRegisterInfo;
 class MCSubtargetInfo;
 class MCTargetOptions;
@@ -52,9 +52,7 @@ MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCSubtargetInfo &STI,
                                     const MCRegisterInfo &MRI,
                                     const MCTargetOptions &Options);
 
-std::unique_ptr<MCObjectWriter> createBPFELFObjectWriter(raw_pwrite_stream &OS,
-                                                         uint8_t OSABI,
-                                                         bool IsLittleEndian);
+std::unique_ptr<MCObjectTargetWriter> createBPFELFObjectWriter(uint8_t OSABI);
 }
 
 // Defines symbolic names for BPF registers.  This defines a mapping from