author    | Dimitry Andric <dim@FreeBSD.org> | 2018-02-02 17:07:53 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2018-02-02 17:07:53 +0000
commit    | 6d18171c1901a4db5d3e757a5ba4737fe8789dec
tree      | 6adfbc90504e1005368a826374523b46773e1599
parent    | 4a6a1ccbecd7e34f40b05b4ba0a05d0031dd1eff
Vendor import of llvm release_60 branch r324090 (tag: vendor/llvm/llvm-release_60-r324090)
Notes:
svn path=/vendor/llvm/dist-release_60/; revision=328786
svn path=/vendor/llvm/llvm-release_60-r324090/; revision=328787; tag=vendor/llvm/llvm-release_60-r324090
51 files changed, 1787 insertions, 72 deletions
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 4370d116e08c..064526b1efa7 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -417,6 +417,9 @@ namespace llvm {
   // This pass expands memcmp() to load/stores.
   FunctionPass *createExpandMemCmpPass();
 
+  // This pass expands indirectbr instructions.
+  FunctionPass *createIndirectBrExpandPass();
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/CodeGen/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h
index 38a1b33aecad..6b5404be35d3 100644
--- a/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/include/llvm/CodeGen/TargetInstrInfo.h
@@ -950,6 +950,10 @@ public:
   /// Return true when a target supports MachineCombiner.
   virtual bool useMachineCombiner() const { return false; }
 
+  /// Return true if the given SDNode can be copied during scheduling
+  /// even if it has glue.
+  virtual bool canCopyGluedNodeDuringSchedule(SDNode *N) const { return false; }
+
 protected:
   /// Target-dependent implementation for foldMemoryOperand.
   /// Target-independent code in foldMemoryOperand will
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 380e3b19dc80..cea8472caa35 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -800,7 +800,7 @@ public:
   }
 
   /// Return true if lowering to a jump table is allowed.
-  bool areJTsAllowed(const Function *Fn) const {
+  virtual bool areJTsAllowed(const Function *Fn) const {
     if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
       return false;
diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h
index a378c7b2fca1..da9841a0586e 100644
--- a/include/llvm/CodeGen/TargetPassConfig.h
+++ b/include/llvm/CodeGen/TargetPassConfig.h
@@ -416,6 +416,13 @@ protected:
   /// immediately before machine code is emitted.
   virtual void addPreEmitPass() { }
 
+  /// Targets may add passes immediately before machine code is emitted in this
+  /// callback. This is called even later than `addPreEmitPass`.
+  // FIXME: Rename `addPreEmitPass` to something more sensible given its actual
+  // position and remove the `2` suffix here as this callback is what
+  // `addPreEmitPass` *should* be but in reality isn't.
+  virtual void addPreEmitPass2() {}
+
   /// Utilities for targets to add passes to the pass manager.
   ///
diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h
index 576522aef466..9d99cba347ce 100644
--- a/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -174,6 +174,9 @@ public:
   /// \brief True if the subtarget should run the atomic expansion pass.
   virtual bool enableAtomicExpand() const;
 
+  /// True if the subtarget should run the indirectbr expansion pass.
+  virtual bool enableIndirectBrExpand() const;
+
   /// \brief Override generic scheduling policy within a region.
/// /// This is a convenient way for targets that don't provide any custom diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index dd7aa722ed2b..4c79333f5d2e 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -161,6 +161,7 @@ void initializeIVUsersWrapperPassPass(PassRegistry&); void initializeIfConverterPass(PassRegistry&); void initializeImplicitNullChecksPass(PassRegistry&); void initializeIndVarSimplifyLegacyPassPass(PassRegistry&); +void initializeIndirectBrExpandPassPass(PassRegistry&); void initializeInductiveRangeCheckEliminationPass(PassRegistry&); void initializeInferAddressSpacesPass(PassRegistry&); void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&); diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 3aeb4910ab10..865de4f47afa 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_library(LLVMCodeGen GlobalMerge.cpp IfConversion.cpp ImplicitNullChecks.cpp + IndirectBrExpandPass.cpp InlineSpiller.cpp InterferenceCache.cpp InterleavedAccessPass.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index c0d7eb4cf47b..879cd2859ee9 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -38,6 +38,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); initializeImplicitNullChecksPass(Registry); + initializeIndirectBrExpandPassPass(Registry); initializeInterleavedAccessPass(Registry); initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); diff --git a/lib/CodeGen/IndirectBrExpandPass.cpp b/lib/CodeGen/IndirectBrExpandPass.cpp new file mode 100644 index 000000000000..7b05ebf820fd --- /dev/null +++ b/lib/CodeGen/IndirectBrExpandPass.cpp @@ -0,0 +1,221 @@ +//===- IndirectBrExpandPass.cpp - Expand indirectbr to switch -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Implements an expansion pass to turn `indirectbr` instructions in the IR +/// into `switch` instructions. This works by enumerating the basic blocks in +/// a dense range of integers, replacing each `blockaddr` constant with the +/// corresponding integer constant, and then building a switch that maps from +/// the integers to the actual blocks. All of the indirectbr instructions in the +/// function are redirected to this common switch. +/// +/// While this is generically useful if a target is unable to codegen +/// `indirectbr` natively, it is primarily useful when there is some desire to +/// get the builtin non-jump-table lowering of a switch even when the input +/// source contained an explicit indirect branch construct. +/// +/// Note that it doesn't make any sense to enable this pass unless a target also +/// disables jump-table lowering of switches. Doing that is likely to pessimize +/// the code. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "indirectbr-expand" + +namespace { + +class IndirectBrExpandPass : public FunctionPass { + const TargetLowering *TLI = nullptr; + +public: + static char ID; // Pass identification, replacement for typeid + + IndirectBrExpandPass() : FunctionPass(ID) { + initializeIndirectBrExpandPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +char IndirectBrExpandPass::ID = 0; + +INITIALIZE_PASS(IndirectBrExpandPass, DEBUG_TYPE, + "Expand indirectbr instructions", false, false) + +FunctionPass *llvm::createIndirectBrExpandPass() { + return new IndirectBrExpandPass(); +} + +bool IndirectBrExpandPass::runOnFunction(Function &F) { + auto &DL = F.getParent()->getDataLayout(); + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + auto &TM = TPC->getTM<TargetMachine>(); + auto &STI = *TM.getSubtargetImpl(F); + if (!STI.enableIndirectBrExpand()) + return false; + TLI = STI.getTargetLowering(); + + SmallVector<IndirectBrInst *, 1> IndirectBrs; + + // Set of all potential successors for indirectbr instructions. + SmallPtrSet<BasicBlock *, 4> IndirectBrSuccs; + + // Build a list of indirectbrs that we want to rewrite. + for (BasicBlock &BB : F) + if (auto *IBr = dyn_cast<IndirectBrInst>(BB.getTerminator())) { + // Handle the degenerate case of no successors by replacing the indirectbr + // with unreachable as there is no successor available. + if (IBr->getNumSuccessors() == 0) { + (void)new UnreachableInst(F.getContext(), IBr); + IBr->eraseFromParent(); + continue; + } + + IndirectBrs.push_back(IBr); + for (BasicBlock *SuccBB : IBr->successors()) + IndirectBrSuccs.insert(SuccBB); + } + + if (IndirectBrs.empty()) + return false; + + // If we need to replace any indirectbrs we need to establish integer + // constants that will correspond to each of the basic blocks in the function + // whose address escapes. We do that here and rewrite all the blockaddress + // constants to just be those integer constants cast to a pointer type. + SmallVector<BasicBlock *, 4> BBs; + + for (BasicBlock &BB : F) { + // Skip blocks that aren't successors to an indirectbr we're going to + // rewrite. 
+ if (!IndirectBrSuccs.count(&BB)) + continue; + + auto IsBlockAddressUse = [&](const Use &U) { + return isa<BlockAddress>(U.getUser()); + }; + auto BlockAddressUseIt = llvm::find_if(BB.uses(), IsBlockAddressUse); + if (BlockAddressUseIt == BB.use_end()) + continue; + + assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(), + IsBlockAddressUse) == BB.use_end() && + "There should only ever be a single blockaddress use because it is " + "a constant and should be uniqued."); + + auto *BA = cast<BlockAddress>(BlockAddressUseIt->getUser()); + + // Skip if the constant was formed but ended up not being used (due to DCE + // or whatever). + if (!BA->isConstantUsed()) + continue; + + // Compute the index we want to use for this basic block. We can't use zero + // because null can be compared with block addresses. + int BBIndex = BBs.size() + 1; + BBs.push_back(&BB); + + auto *ITy = cast<IntegerType>(DL.getIntPtrType(BA->getType())); + ConstantInt *BBIndexC = ConstantInt::get(ITy, BBIndex); + + // Now rewrite the blockaddress to an integer constant based on the index. + // FIXME: We could potentially preserve the uses as arguments to inline asm. + // This would allow some uses such as diagnostic information in crashes to + // have higher quality even when this transform is enabled, but would break + // users that round-trip blockaddresses through inline assembly and then + // back into an indirectbr. + BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(BBIndexC, BA->getType())); + } + + if (BBs.empty()) { + // There are no blocks whose address is taken, so any indirectbr instruction + // cannot get a valid input and we can replace all of them with unreachable. + for (auto *IBr : IndirectBrs) { + (void)new UnreachableInst(F.getContext(), IBr); + IBr->eraseFromParent(); + } + return true; + } + + BasicBlock *SwitchBB; + Value *SwitchValue; + + // Compute a common integer type across all the indirectbr instructions. + IntegerType *CommonITy = nullptr; + for (auto *IBr : IndirectBrs) { + auto *ITy = + cast<IntegerType>(DL.getIntPtrType(IBr->getAddress()->getType())); + if (!CommonITy || ITy->getBitWidth() > CommonITy->getBitWidth()) + CommonITy = ITy; + } + + auto GetSwitchValue = [DL, CommonITy](IndirectBrInst *IBr) { + return CastInst::CreatePointerCast( + IBr->getAddress(), CommonITy, + Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr); + }; + + if (IndirectBrs.size() == 1) { + // If we only have one indirectbr, we can just directly replace it within + // its block. + SwitchBB = IndirectBrs[0]->getParent(); + SwitchValue = GetSwitchValue(IndirectBrs[0]); + IndirectBrs[0]->eraseFromParent(); + } else { + // Otherwise we need to create a new block to hold the switch across BBs, + // jump to that block instead of each indirectbr, and phi together the + // values for the switch. + SwitchBB = BasicBlock::Create(F.getContext(), "switch_bb", &F); + auto *SwitchPN = PHINode::Create(CommonITy, IndirectBrs.size(), + "switch_value_phi", SwitchBB); + SwitchValue = SwitchPN; + + // Now replace the indirectbr instructions with direct branches to the + // switch block and fill out the PHI operands. + for (auto *IBr : IndirectBrs) { + SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent()); + BranchInst::Create(SwitchBB, IBr); + IBr->eraseFromParent(); + } + } + + // Now build the switch in the block. The block will have no terminator + // already. + auto *SI = SwitchInst::Create(SwitchValue, BBs[0], BBs.size(), SwitchBB); + + // Add a case for each block. 
+ for (int i : llvm::seq<int>(1, BBs.size())) + SI->addCase(ConstantInt::get(CommonITy, i + 1), BBs[i]); + + return true; +} diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b566c232cbc3..3a2fb0c0a836 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1996,14 +1996,15 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.IsSExt = isSigned; - Entry.IsZExt = !isSigned; + Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); + Entry.IsZExt = !TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); - Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); + EVT RetVT = Node->getValueType(0); + Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); // By default, the input chain to this libcall is the entry node of the // function. If the libcall is going to be emitted as a tail call then @@ -2022,13 +2023,14 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); + bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned); CLI.setDebugLoc(SDLoc(Node)) .setChain(InChain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setTailCall(isTailCall) - .setSExtResult(isSigned) - .setZExtResult(!isSigned) + .setSExtResult(signExtend) + .setZExtResult(!signExtend) .setIsPostTypeLegalization(true); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 49f304c8cc86..82337d43c5c9 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1117,22 +1117,34 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { if (!N) return nullptr; - if (SU->getNode()->getGluedNode()) + DEBUG(dbgs() << "Considering duplicating the SU\n"); + DEBUG(SU->dump(this)); + + if (N->getGluedNode() && + !TII->canCopyGluedNodeDuringSchedule(N)) { + DEBUG(dbgs() + << "Giving up because it has incoming glue and the target does not " + "want to copy it\n"); return nullptr; + } SUnit *NewSU; bool TryUnfold = false; for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { MVT VT = N->getSimpleValueType(i); - if (VT == MVT::Glue) + if (VT == MVT::Glue) { + DEBUG(dbgs() << "Giving up because it has outgoing glue\n"); return nullptr; - else if (VT == MVT::Other) + } else if (VT == MVT::Other) TryUnfold = true; } for (const SDValue &Op : N->op_values()) { MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo()); - if (VT == MVT::Glue) + if (VT == MVT::Glue && !TII->canCopyGluedNodeDuringSchedule(N)) { + DEBUG(dbgs() << "Giving up because it one of the operands is glue and " + "the target does not want to copy it\n"); return nullptr; + } } // If possible unfold instruction. 
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 6c91bdc1c524..3e6ad3eeef0f 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -907,6 +907,9 @@ void TargetPassConfig::addMachinePasses() { if (EnableMachineOutliner) PM->add(createMachineOutlinerPass(EnableLinkOnceODROutlining)); + // Add passes that directly emit MI after all other MI passes. + addPreEmitPass2(); + AddingMachinePasses = false; } diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp index 1a317cd865f0..8693f344f9be 100644 --- a/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/lib/CodeGen/TargetSubtargetInfo.cpp @@ -38,6 +38,10 @@ bool TargetSubtargetInfo::enableAtomicExpand() const { return true; } +bool TargetSubtargetInfo::enableIndirectBrExpand() const { + return false; +} + bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 61967605432e..2c127d787260 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3756,36 +3756,45 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // FIXME: This isn't safe because the addressing mode doesn't work // correctly if vaddr is negative. // - // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate - // being in src0. - // // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. // // See if we can extract an immediate offset by recognizing one of these: // V_ADD_I32_e32 dst, imm, src1 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 // V_ADD will be removed by "Remove dead machine instructions". - if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) { - const MachineOperand *Src = - getNamedOperand(*Add, AMDGPU::OpName::src0); - - if (Src->isReg()) { - auto Mov = MRI.getUniqueVRegDef(Src->getReg()); - if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) - Src = &Mov->getOperand(1); - } + if (Add && + (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || + Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { + static const unsigned SrcNames[2] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + }; + + // Find a literal offset in one of source operands. 
+ for (int i = 0; i < 2; i++) { + const MachineOperand *Src = + getNamedOperand(*Add, SrcNames[i]); + + if (Src->isReg()) { + auto Mov = MRI.getUniqueVRegDef(Src->getReg()); + if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) + Src = &Mov->getOperand(1); + } + + if (Src) { + if (Src->isImm()) + Offset = Src->getImm(); + else if (Src->isCImm()) + Offset = Src->getCImm()->getZExtValue(); + } + + if (Offset && isLegalMUBUFImmOffset(Offset)) { + VAddr = getNamedOperand(*Add, SrcNames[!i]); + break; + } - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } - - if (Offset && isLegalMUBUFImmOffset(Offset)) - VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1); - else Offset = 0; + } } BuildMI(*MBB, Inst, Inst.getDebugLoc(), diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 49645834e2de..05c98aab6f27 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -141,3 +141,16 @@ void Thumb1InstrInfo::expandLoadStackGuard( else expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi); } + +bool Thumb1InstrInfo::canCopyGluedNodeDuringSchedule(SDNode *N) const { + // In Thumb1 the scheduler may need to schedule a cross-copy between GPRS and CPSR + // but this is not always possible there, so allow the Scheduler to clone tADCS and tSBCS + // even if they have glue. + // FIXME. Actually implement the cross-copy where it is possible (post v6) + // because these copies entail more spilling. + unsigned Opcode = N->getMachineOpcode(); + if (Opcode == ARM::tADCS || Opcode == ARM::tSBCS) + return true; + + return false; +} diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index e8d9a9c4ff14..9f04a3ed262f 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -53,6 +53,7 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; }; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 79ca9cc6b800..ba05b0f48df7 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -3507,10 +3507,9 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { - if (Subtarget.hasMips3() && Subtarget.useSoftFloat()) { - if (Type == MVT::i32) + if ((ABI.IsN32() || ABI.IsN64()) && Type == MVT::i32) return true; - } + return IsSigned; } diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index 9db6b7b1bcd6..f767c8321988 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -136,6 +136,13 @@ IsGlobalInSmallSectionImpl(const GlobalObject *GO, return false; Type *Ty = GVA->getValueType(); + + // It is possible that the type of the global is unsized, i.e. a declaration + // of a extern struct. In this case don't presume it is in the small data + // section. This happens e.g. when building the FreeBSD kernel. 
+ if (!Ty->isSized()) + return false; + return IsInSmallSection( GVA->getParent()->getDataLayout().getTypeAllocSize(Ty)); } diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index 9864aa372354..9f6c7d65592d 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -88,10 +88,11 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineFrameInfo &MFI = MF.getFrameInfo(); + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); const SparcInstrInfo &TII = - *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); + *static_cast<const SparcInstrInfo *>(Subtarget.getInstrInfo()); const SparcRegisterInfo &RegInfo = - *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + *static_cast<const SparcRegisterInfo *>(Subtarget.getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. @@ -141,7 +142,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, // Adds the SPARC subtarget-specific spill area to the stack // size. Also ensures target-required alignment. - NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); + NumBytes = Subtarget.getAdjustedFrameSize(NumBytes); // Finally, ensure that the size is sufficiently aligned for the // data on the stack. @@ -176,9 +177,27 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex); if (NeedsStackRealignment) { - // andn %o6, MaxAlign-1, %o6 + int64_t Bias = Subtarget.getStackPointerBias(); + unsigned regUnbiased; + if (Bias) { + // This clobbers G1 which we always know is available here. + regUnbiased = SP::G1; + // add %o6, BIAS, %g1 + BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), regUnbiased) + .addReg(SP::O6).addImm(Bias); + } else + regUnbiased = SP::O6; + + // andn %regUnbiased, MaxAlign-1, %regUnbiased int MaxAlign = MFI.getMaxAlignment(); - BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1); + BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), regUnbiased) + .addReg(regUnbiased).addImm(MaxAlign - 1); + + if (Bias) { + // add %g1, -BIAS, %o6 + BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6) + .addReg(regUnbiased).addImm(-Bias); + } } } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 7e0df2941467..23ac9d9936ad 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -48,6 +48,7 @@ set(sources X86PadShortFunction.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp + X86RetpolineThunks.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp X86Subtarget.cpp diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 5631648d2dc8..361326824292 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -22,6 +22,7 @@ namespace llvm { class FunctionPass; class ImmutablePass; class InstructionSelector; +class ModulePass; class PassRegistry; class X86RegisterBankInfo; class X86Subtarget; @@ -102,6 +103,9 @@ void initializeFixupBWInstPassPass(PassRegistry &); /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); +/// This pass creates the thunks for the retpoline feature. 
+FunctionPass *createX86RetpolineThunksPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index ba998467b799..ba97982e3330 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -329,6 +329,27 @@ def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast.">; +// Enable mitigation of some aspects of speculative execution related +// vulnerabilities by removing speculatable indirect branches. This disables +// jump-table formation, rewrites explicit `indirectbr` instructions into +// `switch` instructions, and uses a special construct called a "retpoline" to +// prevent speculation of the remaining indirect branches (indirect calls and +// tail calls). +def FeatureRetpoline + : SubtargetFeature<"retpoline", "UseRetpoline", "true", + "Remove speculation of indirect branches from the " + "generated code, either by avoiding them entirely or " + "lowering them with a speculation blocking construct.">; + +// Rely on external thunks for the emitted retpoline calls. This allows users +// to provide their own custom thunk definitions in highly specialized +// environments such as a kernel that does boot-time hot patching. +def FeatureRetpolineExternalThunk + : SubtargetFeature< + "retpoline-external-thunk", "UseRetpolineExternalThunk", "true", + "Enable retpoline, but with an externally provided thunk.", + [FeatureRetpoline]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 7e70789ac82c..31328e6aea95 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -32,6 +32,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { FaultMaps FM; std::unique_ptr<MCCodeEmitter> CodeEmitter; bool EmitFPOData = false; + bool NeedsRetpoline = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. // It is used by the X86AsmPrinter to ensure that the stackmap shadow diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 5dae485f4c9f..80ce3c579fe0 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3172,6 +3172,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers"))) return false; + // Functions using retpoline should use SDISel for calls. + if (Subtarget->useRetpoline()) + return false; + // Handle only C, fastcc, and webkit_js calling conventions for now. switch (CC) { default: return false; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 80b1cc192a88..11808f8995fe 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -741,6 +741,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + // FIXME: Add retpoline support and remove this. + if (Is64Bit && IsLargeCodeModel && STI.useRetpoline()) + report_fatal_error("Emitting stack probe calls on 64-bit with the large " + "code model and retpoline not yet implemented."); + unsigned CallOp; if (Is64Bit) CallOp = IsLargeCodeModel ? 
X86::CALL64r : X86::CALL64pcrel32; @@ -2345,6 +2350,10 @@ void X86FrameLowering::adjustForSegmentedStacks( // This solution is not perfect, as it assumes that the .rodata section // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. + // FIXME: Add retpoline support and remove the error here.. + if (STI.useRetpoline()) + report_fatal_error("Emitting morestack calls on 64-bit with the large " + "code model and retpoline not yet implemented."); BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 660c1eff3c4b..d79fd0ca4daa 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -629,11 +629,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && - // Only does this when target favors doesn't favor register indirect - // call. + // Only do this when the target can fold the load into the call or + // jmp. + !Subtarget->useRetpoline() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && - // Only does this if load can be folded into TC_RETURN. (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3a163637da26..38885c42b529 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25767,6 +25767,15 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return isShuffleMaskLegal(Mask, VT); } +bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { + // If the subtarget is using retpolines, we need to not generate jump tables. + if (Subtarget.useRetpoline()) + return false; + + // Otherwise, fallback on the generic logic. + return TargetLowering::areJTsAllowed(Fn); +} + //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// @@ -27069,6 +27078,115 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, return BB; } +static unsigned getOpcodeForRetpoline(unsigned RPOpc) { + switch (RPOpc) { + case X86::RETPOLINE_CALL32: + return X86::CALLpcrel32; + case X86::RETPOLINE_CALL64: + return X86::CALL64pcrel32; + case X86::RETPOLINE_TCRETURN32: + return X86::TCRETURNdi; + case X86::RETPOLINE_TCRETURN64: + return X86::TCRETURNdi64; + } + llvm_unreachable("not retpoline opcode"); +} + +static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, + unsigned Reg) { + switch (Reg) { + case 0: + assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_push" + : "__llvm_retpoline_push"; + case X86::EAX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_eax" + : "__llvm_retpoline_eax"; + case X86::ECX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_ecx" + : "__llvm_retpoline_ecx"; + case X86::EDX: + return Subtarget.useRetpolineExternalThunk() + ? "__llvm_external_retpoline_edx" + : "__llvm_retpoline_edx"; + case X86::R11: + return Subtarget.useRetpolineExternalThunk() + ? 
"__llvm_external_retpoline_r11" + : "__llvm_retpoline_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const { + // Copy the virtual register into the R11 physical register and + // call the retpoline thunk. + DebugLoc DL = MI.getDebugLoc(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + unsigned CalleeVReg = MI.getOperand(0).getReg(); + unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); + + // Find an available scratch register to hold the callee. On 64-bit, we can + // just use R11, but we scan for uses anyway to ensure we don't generate + // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't + // already a register use operand to the call to hold the callee. If none + // are available, push the callee instead. This is less efficient, but is + // necessary for functions using 3 regparms. Such function calls are + // (currently) not eligible for tail call optimization, because there is no + // scratch register available to hold the address of the callee. + SmallVector<unsigned, 3> AvailableRegs; + if (Subtarget.is64Bit()) + AvailableRegs.push_back(X86::R11); + else + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + + // Zero out any registers that are already used. + for (const auto &MO : MI.operands()) { + if (MO.isReg() && MO.isUse()) + for (unsigned &Reg : AvailableRegs) + if (Reg == MO.getReg()) + Reg = 0; + } + + // Choose the first remaining non-zero available register. + unsigned AvailableReg = 0; + for (unsigned MaybeReg : AvailableRegs) { + if (MaybeReg) { + AvailableReg = MaybeReg; + break; + } + } + + const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); + + if (AvailableReg == 0) { + // No register available. Use PUSH. This must not be a tailcall, and this + // must not be x64. + if (Subtarget.is64Bit()) + report_fatal_error( + "Cannot make an indirect call on x86-64 using both retpoline and a " + "calling convention that preservers r11"); + if (Opc != X86::CALLpcrel32) + report_fatal_error("Cannot make an indirect tail call on x86 using " + "retpoline without a preserved register"); + BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + } else { + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); + } + return BB; +} + MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -27584,6 +27702,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); + case X86::RETPOLINE_CALL32: + case X86::RETPOLINE_CALL64: + case X86::RETPOLINE_TCRETURN32: + case X86::RETPOLINE_TCRETURN64: + return EmitLoweredRetpoline(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 1fb7c7ed4e98..3aa9d01bff20 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -982,6 +982,9 @@ namespace llvm { bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const override; + /// Returns true if lowering to a jump table is allowed. 
+ bool areJTsAllowed(const Function *Fn) const override; + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. @@ -1294,6 +1297,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 06600a4ef286..d66d9258e96f 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1146,14 +1146,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + Requires<[Not64BitMode, NotUseRetpoline]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[Not64BitMode, IsNotPIC]>; + Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi tglobaladdr:$dst, imm:$off)>, @@ -1165,13 +1165,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. 
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, UseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, UseRetpoline]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 5581fd462a1d..7932686ebc87 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -211,11 +211,12 @@ let isCall = 1 in Sched<[WriteJumpLd]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, - OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>, + Sched<[WriteJump]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, OpSize32, - Requires<[Not64BitMode,FavorMemIndirectCall]>, + Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>, Sched<[WriteJumpLd]>; let Predicates = [Not64BitMode] in { @@ -298,11 +299,12 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, - Requires<[In64BitMode]>; + Requires<[In64BitMode,NotUseRetpoline]>; def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, - Requires<[In64BitMode,FavorMemIndirectCall]>; + Requires<[In64BitMode,FavorMemIndirectCall, + NotUseRetpoline]>; def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; @@ -341,6 +343,27 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, } } +let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, + Uses = [RSP, SSP], + usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def RETPOLINE_CALL32 : + PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, + Requires<[Not64BitMode,UseRetpoline]>; + + def RETPOLINE_CALL64 : + PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, + Requires<[In64BitMode,UseRetpoline]>; + + // Retpoline variant of indirect tail calls. + let isTerminator = 1, isReturn = 1, isBarrier = 1 in { + def RETPOLINE_TCRETURN64 : + PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; + def RETPOLINE_TCRETURN32 : + PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>; + } +} + // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. 
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 27c67500b26f..a657b19c08c9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -938,6 +938,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; +def UseRetpoline : Predicate<"Subtarget->useRetpoline()">; +def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 8a7179e48a0b..730ba745eb70 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -874,6 +874,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, // address is to far away. (TODO: support non-relative addressing) break; case MachineOperand::MO_Register: + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error("Lowering register statepoints with retpoline not " + "yet implemented."); CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); CallOpcode = X86::CALL64r; break; @@ -1028,6 +1032,10 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, EmitAndCountInstruction( MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error( + "Lowering patchpoint with retpoline not yet implemented."); EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp new file mode 100644 index 000000000000..223fa5771498 --- /dev/null +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -0,0 +1,311 @@ +//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that injects an MI thunk implementing a "retpoline". This is +/// a RET-implemented trampoline that is used to lower indirect calls in a way +/// that prevents speculation on some x86 processors and can be used to mitigate +/// security vulnerabilities due to targeted speculative execution and side +/// channels such as CVE-2017-5715. +/// +/// TODO(chandlerc): All of this code could use better comments and +/// documentation. 
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-retpoline-thunks" + +static const char ThunkNamePrefix[] = "__llvm_retpoline_"; +static const char R11ThunkName[] = "__llvm_retpoline_r11"; +static const char EAXThunkName[] = "__llvm_retpoline_eax"; +static const char ECXThunkName[] = "__llvm_retpoline_ecx"; +static const char EDXThunkName[] = "__llvm_retpoline_edx"; +static const char PushThunkName[] = "__llvm_retpoline_push"; + +namespace { +class X86RetpolineThunks : public MachineFunctionPass { +public: + static char ID; + + X86RetpolineThunks() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "X86 Retpoline Thunks"; } + + bool doInitialization(Module &M) override; + bool runOnMachineFunction(MachineFunction &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineModuleInfo>(); + AU.addPreserved<MachineModuleInfo>(); + } + +private: + MachineModuleInfo *MMI; + const TargetMachine *TM; + bool Is64Bit; + const X86Subtarget *STI; + const X86InstrInfo *TII; + + bool InsertedThunks; + + void createThunkFunction(Module &M, StringRef Name); + void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); + void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); + void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None); +}; + +} // end anonymous namespace + +FunctionPass *llvm::createX86RetpolineThunksPass() { + return new X86RetpolineThunks(); +} + +char X86RetpolineThunks::ID = 0; + +bool X86RetpolineThunks::doInitialization(Module &M) { + InsertedThunks = false; + return false; +} + +bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << getPassName() << '\n'); + + TM = &MF.getTarget();; + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; + + MMI = &getAnalysis<MachineModuleInfo>(); + Module &M = const_cast<Module &>(*MMI->getModule()); + + // If this function is not a thunk, check to see if we need to insert + // a thunk. + if (!MF.getName().startswith(ThunkNamePrefix)) { + // If we've already inserted a thunk, nothing else to do. + if (InsertedThunks) + return false; + + // Only add a thunk if one of the functions has the retpoline feature + // enabled in its subtarget, and doesn't enable external thunks. + // FIXME: Conditionalize on indirect calls so we don't emit a thunk when + // nothing will end up calling it. + // FIXME: It's a little silly to look at every function just to enumerate + // the subtargets, but eventually we'll want to look at them for indirect + // calls, so maybe this is OK. + if (!STI->useRetpoline() || STI->useRetpolineExternalThunk()) + return false; + + // Otherwise, we need to insert the thunk. + // WARNING: This is not really a well behaving thing to do in a function + // pass. 
We extract the module and insert a new function (and machine + // function) directly into the module. + if (Is64Bit) + createThunkFunction(M, R11ThunkName); + else + for (StringRef Name : + {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName}) + createThunkFunction(M, Name); + InsertedThunks = true; + return true; + } + + // If this *is* a thunk function, we need to populate it with the correct MI. + if (Is64Bit) { + assert(MF.getName() == "__llvm_retpoline_r11" && + "Should only have an r11 thunk on 64-bit targets"); + + // __llvm_retpoline_r11: + // callq .Lr11_call_target + // .Lr11_capture_spec: + // pause + // lfence + // jmp .Lr11_capture_spec + // .align 16 + // .Lr11_call_target: + // movq %r11, (%rsp) + // retq + populateThunk(MF, X86::R11); + } else { + // For 32-bit targets we need to emit a collection of thunks for various + // possible scratch registers as well as a fallback that is used when + // there are no scratch registers and assumes the retpoline target has + // been pushed. + // __llvm_retpoline_eax: + // calll .Leax_call_target + // .Leax_capture_spec: + // pause + // jmp .Leax_capture_spec + // .align 16 + // .Leax_call_target: + // movl %eax, (%esp) # Clobber return addr + // retl + // + // __llvm_retpoline_ecx: + // ... # Same setup + // movl %ecx, (%esp) + // retl + // + // __llvm_retpoline_edx: + // ... # Same setup + // movl %edx, (%esp) + // retl + // + // This last one is a bit more special and so needs a little extra + // handling. + // __llvm_retpoline_push: + // calll .Lpush_call_target + // .Lpush_capture_spec: + // pause + // lfence + // jmp .Lpush_capture_spec + // .align 16 + // .Lpush_call_target: + // # Clear pause_loop return address. + // addl $4, %esp + // # Top of stack words are: Callee, RA. Exchange Callee and RA. + // pushl 4(%esp) # Push callee + // pushl 4(%esp) # Push RA + // popl 8(%esp) # Pop RA to final RA + // popl (%esp) # Pop callee to next top of stack + // retl # Ret to callee + if (MF.getName() == EAXThunkName) + populateThunk(MF, X86::EAX); + else if (MF.getName() == ECXThunkName) + populateThunk(MF, X86::ECX); + else if (MF.getName() == EDXThunkName) + populateThunk(MF, X86::EDX); + else if (MF.getName() == PushThunkName) + populateThunk(MF); + else + llvm_unreachable("Invalid thunk name on x86-32!"); + } + + return true; +} + +void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) { + assert(Name.startswith(ThunkNamePrefix) && + "Created a thunk with an unexpected prefix!"); + + LLVMContext &Ctx = M.getContext(); + auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = + Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); + F->setVisibility(GlobalValue::HiddenVisibility); + F->setComdat(M.getOrInsertComdat(Name)); + + // Add Attributes so that we don't create a frame, unwind information, or + // inline. + AttrBuilder B; + B.addAttribute(llvm::Attribute::NoUnwind); + B.addAttribute(llvm::Attribute::Naked); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + + // Populate our function a bit so that we can verify. + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); + IRBuilder<> Builder(Entry); + + Builder.CreateRetVoid(); +} + +void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, + unsigned Reg) { + const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; + const unsigned SPReg = Is64Bit ? 
X86::RSP : X86::ESP; + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0) + .addReg(Reg); +} + +void X86RetpolineThunks::insert32BitPushReturnAddrClobber( + MachineBasicBlock &MBB) { + // The instruction sequence we use to replace the return address without + // a scratch register is somewhat complicated: + // # Clear capture_spec from return address. + // addl $4, %esp + // # Top of stack words are: Callee, RA. Exchange Callee and RA. + // pushl 4(%esp) # Push callee + // pushl 4(%esp) # Push RA + // popl 8(%esp) # Pop RA to final RA + // popl (%esp) # Pop callee to next top of stack + // retl # Ret to callee + BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, + false, 4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, + false, 4); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, + false, 8); + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, + false, 0); +} + +void X86RetpolineThunks::populateThunk(MachineFunction &MF, + Optional<unsigned> Reg) { + // Set MF properties. We never use vregs... + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + + MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MF.push_back(CaptureSpec); + MF.push_back(CallTarget); + + const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; + + BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget); + Entry->addSuccessor(CallTarget); + Entry->addSuccessor(CaptureSpec); + CallTarget->setHasAddressTaken(); + + // In the capture loop for speculation, we want to stop the processor from + // speculating as fast as possible. On Intel processors, the PAUSE instruction + // will block speculation without consuming any execution resources. On AMD + // processors, the PAUSE instruction is (essentially) a nop, so we also use an + // LFENCE instruction which they have advised will stop speculation as well + // with minimal resource utilization. We still end the capture with a jump to + // form an infinite loop to fully guarantee that no matter what implementation + // of the x86 ISA, speculating this code path never escapes. 
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); + CaptureSpec->setHasAddressTaken(); + CaptureSpec->addSuccessor(CaptureSpec); + + CallTarget->setAlignment(4); + if (Reg) { + insertRegReturnAddrClobber(*CallTarget, *Reg); + } else { + assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); + insert32BitPushReturnAddrClobber(*CallTarget); + } + BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); +} diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index ad023623142f..dca98d999e58 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -314,6 +314,8 @@ void X86Subtarget::initializeEnvironment() { HasSGX = false; HasCLFLUSHOPT = false; HasCLWB = false; + UseRetpoline = false; + UseRetpolineExternalThunk = false; IsPMULLDSlow = false; IsSHLDSlow = false; IsUAMem16Slow = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index c9435890fc1f..37ffac1faf68 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -341,6 +341,14 @@ protected: /// Processor supports Cache Line Write Back instruction bool HasCLWB; + /// Use a retpoline thunk rather than indirect calls to block speculative + /// execution. + bool UseRetpoline; + + /// When using a retpoline thunk, call an externally provided thunk rather + /// than emitting one inside the compiler. + bool UseRetpolineExternalThunk; + /// Use software floating point for code generation. bool UseSoftFloat; @@ -574,6 +582,8 @@ public: bool hasIBT() const { return HasIBT; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } + bool useRetpoline() const { return UseRetpoline; } + bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } bool isXRaySupported() const override { return is64Bit(); } @@ -696,6 +706,10 @@ public: /// Return true if the subtarget allows calls to immediate address. bool isLegalToCallImmediateAddr() const; + /// If we are using retpolines, we need to expand indirectbr to avoid it + /// lowering to an actual indirect jump. + bool enableIndirectBrExpand() const override { return useRetpoline(); } + /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index e95e6ecae091..ac242e1c00e0 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -321,6 +321,7 @@ public: void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; + void addPreEmitPass2() override; void addPreSched2() override; }; @@ -350,6 +351,11 @@ void X86PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); + + // Add passes that handle indirect branch removal and insertion of a retpoline + // thunk. These will be a no-op unless a function subtarget has the retpoline + // feature enabled. 
+ addPass(createIndirectBrExpandPass()); } bool X86PassConfig::addInstSelector() { @@ -436,3 +442,7 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86EvexToVexInsts()); } } + +void X86PassConfig::addPreEmitPass2() { + addPass(createX86RetpolineThunksPass()); +} diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index e703014bb0e6..b665d94a70aa 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1176,7 +1176,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, auto *Earlier = dyn_cast<StoreInst>(DepWrite); auto *Later = dyn_cast<StoreInst>(Inst); if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) && - Later && isa<ConstantInt>(Later->getValueOperand())) { + Later && isa<ConstantInt>(Later->getValueOperand()) && + memoryIsNotModifiedBetween(Earlier, Later, AA)) { // If the store we find is: // a) partially overwritten by the store to 'Loc' // b) the later store is fully contained in the earlier one and diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 8fa9ffb6d014..4a96e0ddca16 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1071,6 +1071,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + NewGEP->copyMetadata(*GEP); // Inherit the inbounds attribute of the original GEP. cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); } else { @@ -1095,6 +1096,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Type::getInt8Ty(GEP->getContext()), NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); + NewGEP->copyMetadata(*GEP); // Inherit the inbounds attribute of the original GEP. cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); if (GEP->getType() != I8PtrTy) diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 9fd20fd67b8c..420c7b80b8d3 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -194,11 +194,7 @@ main_body: ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm: ; GCN-NEXT: %bb. - -; SICIVI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ; - -; GFX9-NEXT: v_add_u32_e32 [[ADD:v[0-9]+]], 0xfff, v0 -; GFX9-NEXT: buffer_load_dword v{{[0-9]}}, [[ADD]], s[0:3], 0 offen ; +; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ; define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4095 @@ -244,16 +240,8 @@ main_body: ; GCN-LABEL: {{^}}smrd_vgpr_merged: ; GCN-NEXT: %bb. 
- -; SICIVI-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -; SICIVI-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 - -; GFX9: buffer_load_dword -; GFX9: buffer_load_dword -; GFX9: buffer_load_dword -; GFX9: buffer_load_dword -; GFX9: buffer_load_dword -; GFX9: buffer_load_dword +; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 { main_body: %a1 = add i32 %a, 4 diff --git a/test/CodeGen/Mips/pr36061.ll b/test/CodeGen/Mips/pr36061.ll new file mode 100644 index 000000000000..6a9aa72aae0e --- /dev/null +++ b/test/CodeGen/Mips/pr36061.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -target-abi n64 | FileCheck %s --check-prefix=MIPSN64 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -target-abi n32 | FileCheck %s --check-prefix=MIPSN32 + +; Test that powi has its integer argument sign extended on mips64. + +declare double @llvm.powi.f64(double, i32) + +define double @powi(double %value, i32 %power) { +; MIPSN64-LABEL: powi: +; MIPSN64: # %bb.0: +; MIPSN64-NEXT: daddiu $sp, $sp, -16 +; MIPSN64-NEXT: .cfi_def_cfa_offset 16 +; MIPSN64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPSN64-NEXT: .cfi_offset 31, -8 +; MIPSN64-NEXT: jal __powidf2 +; MIPSN64-NEXT: sll $5, $5, 0 +; MIPSN64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPSN64-NEXT: jr $ra +; MIPSN64-NEXT: daddiu $sp, $sp, 16 +; +; MIPSN32-LABEL: powi: +; MIPSN32: # %bb.0: +; MIPSN32-NEXT: addiu $sp, $sp, -16 +; MIPSN32-NEXT: .cfi_def_cfa_offset 16 +; MIPSN32-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPSN32-NEXT: .cfi_offset 31, -8 +; MIPSN32-NEXT: jal __powidf2 +; MIPSN32-NEXT: sll $5, $5, 0 +; MIPSN32-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPSN32-NEXT: jr $ra +; MIPSN32-NEXT: addiu $sp, $sp, 16 + %1 = tail call double @llvm.powi.f64(double %value, i32 %power) + ret double %1 +} + +declare float @llvm.powi.f32(float, i32) + +define float @powfi(float %value, i32 %power) { +; MIPSN64-LABEL: powfi: +; MIPSN64: # %bb.0: +; MIPSN64-NEXT: daddiu $sp, $sp, -16 +; MIPSN64-NEXT: .cfi_def_cfa_offset 16 +; MIPSN64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPSN64-NEXT: .cfi_offset 31, -8 +; MIPSN64-NEXT: jal __powisf2 +; MIPSN64-NEXT: sll $5, $5, 0 +; MIPSN64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPSN64-NEXT: jr $ra +; MIPSN64-NEXT: daddiu $sp, $sp, 16 +; +; MIPSN32-LABEL: powfi: +; MIPSN32: # %bb.0: +; MIPSN32-NEXT: addiu $sp, $sp, -16 +; MIPSN32-NEXT: .cfi_def_cfa_offset 16 +; MIPSN32-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPSN32-NEXT: .cfi_offset 31, -8 +; MIPSN32-NEXT: jal __powisf2 +; MIPSN32-NEXT: sll $5, $5, 0 +; MIPSN32-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPSN32-NEXT: jr $ra +; MIPSN32-NEXT: addiu $sp, $sp, 16 + %1 = tail call float @llvm.powi.f32(float %value, i32 %power) + ret float %1 +} diff --git a/test/CodeGen/Mips/unsized-global.ll b/test/CodeGen/Mips/unsized-global.ll new file mode 100644 index 000000000000..a89ecc1fd1cb --- /dev/null +++ b/test/CodeGen/Mips/unsized-global.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Check that -mgpopt doesn't crash on unsized externals +; RUN: llc -mtriple=mips64-unknown-freebsd -mattr=+noabicalls -target-abi n64 -mgpopt -o - %s 
| FileCheck %s + +%struct.a = type opaque + +@b = external global %struct.a, align 1 + +; Function Attrs: norecurse nounwind readnone +define %struct.a* @d() { +; CHECK-LABEL: d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui $1, %highest(b) +; CHECK-NEXT: daddiu $1, $1, %higher(b) +; CHECK-NEXT: dsll $1, $1, 16 +; CHECK-NEXT: daddiu $1, $1, %hi(b) +; CHECK-NEXT: dsll $1, $1, 16 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: daddiu $2, $1, %lo(b) +entry: + ret %struct.a* @b +} diff --git a/test/CodeGen/SPARC/stack-align.ll b/test/CodeGen/SPARC/stack-align.ll index b152e6a038f5..6516fb78e48b 100644 --- a/test/CodeGen/SPARC/stack-align.ll +++ b/test/CodeGen/SPARC/stack-align.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=sparc < %s | FileCheck %s +; RUN: llc -march=sparc < %s | FileCheck %s --check-prefixes=CHECK,CHECK32 +; RUN: llc -march=sparcv9 < %s | FileCheck %s --check-prefixes=CHECK,CHECK64 declare void @stack_realign_helper(i32 %a, i32* %b) ;; This is a function where we have a local variable of 64-byte @@ -7,10 +8,15 @@ declare void @stack_realign_helper(i32 %a, i32* %b) ;; the argument is accessed via frame pointer not stack pointer (to %o0). ;; CHECK-LABEL: stack_realign: -;; CHECK: andn %sp, 63, %sp -;; CHECK-NEXT: ld [%fp+92], %o0 -;; CHECK-NEXT: call stack_realign_helper -;; CHECK-NEXT: add %sp, 128, %o1 +;; CHECK32: andn %sp, 63, %sp +;; CHECK32-NEXT: ld [%fp+92], %o0 +;; CHECK64: add %sp, 2047, %g1 +;; CHECK64-NEXT: andn %g1, 63, %g1 +;; CHECK64-NEXT: add %g1, -2047, %sp +;; CHECK64-NEXT: ld [%fp+2227], %o0 +;; CHECK-NEXT: call stack_realign_helper +;; CHECK32-NEXT: add %sp, 128, %o1 +;; CHECK64-NEXT: add %sp, 2239, %o1 define void @stack_realign(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g) { entry: diff --git a/test/CodeGen/Thumb/pr35836.ll b/test/CodeGen/Thumb/pr35836.ll new file mode 100644 index 000000000000..7765e66658a0 --- /dev/null +++ b/test/CodeGen/Thumb/pr35836.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv5e-none-linux-gnueabi" + +; Function Attrs: norecurse nounwind optsize +define void @f(i32,i32,i32,i32,i32* %x4p, i32* %x5p, i32* %x6p) { +if.end: + br label %while.body + +while.body: + %ll.0100 = phi i64 [ 0, %if.end ], [ %shr32, %while.body ] + %add = add nuw nsw i64 %ll.0100, 0 + %add3 = add nuw nsw i64 %add, 0 + %shr = lshr i64 %add3, 32 + %conv7 = zext i32 %0 to i64 + %conv9 = zext i32 %1 to i64 + %add10 = add nuw nsw i64 %conv9, %conv7 + %add11 = add nuw nsw i64 %add10, %shr + %shr14 = lshr i64 %add11, 32 + %conv16 = zext i32 %2 to i64 + %conv18 = zext i32 %3 to i64 + %add19 = add nuw nsw i64 %conv18, %conv16 + %add20 = add nuw nsw i64 %add19, %shr14 + %conv21 = trunc i64 %add20 to i32 + store i32 %conv21, i32* %x6p, align 4 + %shr23 = lshr i64 %add20, 32 + %x4 = load i32, i32* %x4p, align 4 + %conv25 = zext i32 %x4 to i64 + %x5 = load i32, i32* %x5p, align 4 + %conv27 = zext i32 %x5 to i64 + %add28 = add nuw nsw i64 %conv27, %conv25 + %add29 = add nuw nsw i64 %add28, %shr23 + %shr32 = lshr i64 %add29, 32 + br label %while.body +} +; CHECK: adds r3, r0, r1 +; CHECK: push {r5} +; CHECK: pop {r1} +; CHECK: adcs r1, r1 +; CHECK: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK: adds r2, r0, r2 +; CHECK: push {r5} +; CHECK: pop {r4} +; CHECK: adcs r4, r4 +; CHECK: adds r0, r2, r5 +; CHECK: push {r3} +; CHECK: pop {r0} +; CHECK: adcs r0, r4 +; CHECK: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK: str r0, [r6] +; CHECK: ldr r0, [r7] +; 
CHECK: ldr r6, [sp] @ 4-byte Reload +; CHECK: ldr r6, [r6] +; CHECK: adds r0, r6, r0 diff --git a/test/CodeGen/Thumb/pr35836_2.ll b/test/CodeGen/Thumb/pr35836_2.ll new file mode 100644 index 000000000000..af115e8ce21a --- /dev/null +++ b/test/CodeGen/Thumb/pr35836_2.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i128:64-v128:64:128-a:0:64-n64-S64" +target triple = "thumbv6---gnueabi" + +; Function Attrs: norecurse nounwind readonly +define i128 @a(i64* nocapture readonly %z) local_unnamed_addr #0 { +entry: + %0 = load i64, i64* %z, align 4 + %conv.i = zext i64 %0 to i128 + %arrayidx1 = getelementptr inbounds i64, i64* %z, i64 2 + %1 = load i64, i64* %arrayidx1, align 4 + %conv.i38 = zext i64 %1 to i128 + %shl.i39 = shl nuw i128 %conv.i38, 64 + %or = or i128 %shl.i39, %conv.i + %arrayidx3 = getelementptr inbounds i64, i64* %z, i64 1 + %2 = load i64, i64* %arrayidx3, align 4 + %conv.i37 = zext i64 %2 to i128 + %arrayidx5 = getelementptr inbounds i64, i64* %z, i64 3 + %3 = load i64, i64* %arrayidx5, align 4 + %conv.i35 = zext i64 %3 to i128 + %shl.i36 = shl nuw i128 %conv.i35, 64 + %or7 = or i128 %shl.i36, %conv.i37 + %arrayidx10 = getelementptr inbounds i64, i64* %z, i64 4 + %4 = load i64, i64* %arrayidx10, align 4 + %conv.i64 = zext i64 %4 to i128 + %shl.i33 = shl nuw i128 %conv.i64, 64 + %or12 = or i128 %shl.i33, %conv.i + %arrayidx15 = getelementptr inbounds i64, i64* %z, i64 5 + %5 = load i64, i64* %arrayidx15, align 4 + %conv.i30 = zext i64 %5 to i128 + %shl.i = shl nuw i128 %conv.i30, 64 + %or17 = or i128 %shl.i, %conv.i37 + %add = add i128 %or7, %or + %add18 = add i128 %or17, %or12 + %mul = mul i128 %add18, %add + ret i128 %mul +} +; CHECK: adds r4, r2, r7 +; CHECK: mov r4, r1 +; CHECK: adcs r4, r6 +; CHECK: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK: adcs r5, r4 +; CHECK: ldr r4, [sp, #24] @ 4-byte Reload +; CHECK: adcs r3, r4 +; CHECK: adds r4, r2, r7 +; CHECK: adcs r1, r6 +; CHECK: mov r2, sp +; CHECK: str r4, [r2] +; CHECK: str r1, [r2, #4] +; CHECK: ldr r6, [r0, #16] +; CHECK: ldr r7, [r0, #24] +; CHECK: adcs r7, r6 +; CHECK: str r7, [r2, #8] +; CHECK: ldr r6, [r0, #20] +; CHECK: ldr r0, [r0, #28] +; CHECK: adcs r0, r6 diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll index cb7dabefe45a..3a720a5288a2 100644 --- a/test/CodeGen/X86/O0-pipeline.ll +++ b/test/CodeGen/X86/O0-pipeline.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics +; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Rewrite Symbols ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction @@ -57,6 +58,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Insert XRay ops ; CHECK-NEXT: Implement the 'patchable-function' attribute +; CHECK-NEXT: X86 Retpoline Thunks ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: MachineDominator Tree Construction diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll new file mode 100644 index 000000000000..66d32ba5d73d --- /dev/null +++ b/test/CodeGen/X86/retpoline-external.ll @@ -0,0 +1,166 @@ +; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64 +; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST + +; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86 +; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST + +declare void @bar(i32) + +; Test a simple indirect call and tail call. +define void @icall_reg(void (i32)* %fp, i32 %x) #0 { +entry: + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + ret void +} + +; X64-LABEL: icall_reg: +; X64-DAG: movq %rdi, %[[fp:[^ ]*]] +; X64-DAG: movl %esi, %[[x:[^ ]*]] +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: callq __llvm_external_retpoline_r11 +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_reg: +; X64FAST: callq bar +; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq bar +; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_reg: +; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] +; X86-DAG: movl 16(%esp), %[[x:[^ ]*]] +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_external_retpoline_eax +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_external_retpoline_eax +; X86-NOT: # TAILCALL + +; X86FAST-LABEL: icall_reg: +; X86FAST: calll bar +; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll bar +; X86FAST: calll __llvm_external_retpoline_eax + + +@global_fp = external global void (i32)* + +; Test an indirect call through a global variable. 
+define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { + %fp1 = load void (i32)*, void (i32)** @global_fp + call void %fp1(i32 %x) + %fp2 = load void (i32)*, void (i32)** @global_fp + tail call void %fp2(i32 %x) + ret void +} + +; X64-LABEL: icall_global_fp: +; X64-DAG: movl %edi, %[[x:[^ ]*]] +; X64-DAG: movq global_fp(%rip), %r11 +; X64: callq __llvm_external_retpoline_r11 +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq global_fp(%rip), %r11 +; X64: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_global_fp: +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_global_fp: +; X86: movl global_fp, %eax +; X86: pushl 4(%esp) +; X86: calll __llvm_external_retpoline_eax +; X86: addl $4, %esp +; X86: movl global_fp, %eax +; X86: jmp __llvm_external_retpoline_eax # TAILCALL + +; X86FAST-LABEL: icall_global_fp: +; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL + + +%struct.Foo = type { void (%struct.Foo*)** } + +; Test an indirect call through a vtable. +define void @vcall(%struct.Foo* %obj) #0 { + %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0 + %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field + %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1 + %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot + tail call void %fp(%struct.Foo* %obj) + tail call void %fp(%struct.Foo* %obj) + ret void +} + +; X64-LABEL: vcall: +; X64: movq %rdi, %[[obj:[^ ]*]] +; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] +; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] +; X64: movq %[[fp]], %r11 +; X64: callq __llvm_external_retpoline_r11 +; X64-DAG: movq %[[obj]], %rdi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: vcall: +; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL + +; X86-LABEL: vcall: +; X86: movl 8(%esp), %[[obj:[^ ]*]] +; X86: movl (%[[obj]]), %[[vptr:[^ ]*]] +; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] +; X86: movl %[[fp]], %eax +; X86: pushl %[[obj]] +; X86: calll __llvm_external_retpoline_eax +; X86: addl $4, %esp +; X86: movl %[[fp]], %eax +; X86: jmp __llvm_external_retpoline_eax # TAILCALL + +; X86FAST-LABEL: vcall: +; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL + + +declare void @direct_callee() + +define void @direct_tail() #0 { + tail call void @direct_callee() + ret void +} + +; X64-LABEL: direct_tail: +; X64: jmp direct_callee # TAILCALL +; X64FAST-LABEL: direct_tail: +; X64FAST: jmp direct_callee # TAILCALL +; X86-LABEL: direct_tail: +; X86: jmp direct_callee # TAILCALL +; X86FAST-LABEL: direct_tail: +; X86FAST: jmp direct_callee # TAILCALL + + +; Lastly check that no thunks were emitted. 
+; X64-NOT: __{{.*}}_retpoline_{{.*}}: +; X64FAST-NOT: __{{.*}}_retpoline_{{.*}}: +; X86-NOT: __{{.*}}_retpoline_{{.*}}: +; X86FAST-NOT: __{{.*}}_retpoline_{{.*}}: + + +attributes #0 = { "target-features"="+retpoline-external-thunk" } diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll new file mode 100644 index 000000000000..57d3388b812a --- /dev/null +++ b/test/CodeGen/X86/retpoline.ll @@ -0,0 +1,367 @@ +; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64 +; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST + +; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86 +; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST + +declare void @bar(i32) + +; Test a simple indirect call and tail call. +define void @icall_reg(void (i32)* %fp, i32 %x) #0 { +entry: + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + ret void +} + +; X64-LABEL: icall_reg: +; X64-DAG: movq %rdi, %[[fp:[^ ]*]] +; X64-DAG: movl %esi, %[[x:[^ ]*]] +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: callq __llvm_retpoline_r11 +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_reg: +; X64FAST: callq bar +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: callq bar +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_reg: +; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] +; X86-DAG: movl 16(%esp), %[[x:[^ ]*]] +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_retpoline_eax +; X86: pushl %[[x]] +; X86: calll bar +; X86: movl %[[fp]], %eax +; X86: pushl %[[x]] +; X86: calll __llvm_retpoline_eax +; X86-NOT: # TAILCALL + +; X86FAST-LABEL: icall_reg: +; X86FAST: calll bar +; X86FAST: calll __llvm_retpoline_eax +; X86FAST: calll bar +; X86FAST: calll __llvm_retpoline_eax + + +@global_fp = external global void (i32)* + +; Test an indirect call through a global variable. 
+define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { + %fp1 = load void (i32)*, void (i32)** @global_fp + call void %fp1(i32 %x) + %fp2 = load void (i32)*, void (i32)** @global_fp + tail call void %fp2(i32 %x) + ret void +} + +; X64-LABEL: icall_global_fp: +; X64-DAG: movl %edi, %[[x:[^ ]*]] +; X64-DAG: movq global_fp(%rip), %r11 +; X64: callq __llvm_retpoline_r11 +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq global_fp(%rip), %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: icall_global_fp: +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL + +; X86-LABEL: icall_global_fp: +; X86: movl global_fp, %eax +; X86: pushl 4(%esp) +; X86: calll __llvm_retpoline_eax +; X86: addl $4, %esp +; X86: movl global_fp, %eax +; X86: jmp __llvm_retpoline_eax # TAILCALL + +; X86FAST-LABEL: icall_global_fp: +; X86FAST: calll __llvm_retpoline_eax +; X86FAST: jmp __llvm_retpoline_eax # TAILCALL + + +%struct.Foo = type { void (%struct.Foo*)** } + +; Test an indirect call through a vtable. +define void @vcall(%struct.Foo* %obj) #0 { + %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0 + %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field + %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1 + %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot + tail call void %fp(%struct.Foo* %obj) + tail call void %fp(%struct.Foo* %obj) + ret void +} + +; X64-LABEL: vcall: +; X64: movq %rdi, %[[obj:[^ ]*]] +; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] +; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] +; X64: movq %[[fp]], %r11 +; X64: callq __llvm_retpoline_r11 +; X64-DAG: movq %[[obj]], %rdi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL + +; X64FAST-LABEL: vcall: +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL + +; X86-LABEL: vcall: +; X86: movl 8(%esp), %[[obj:[^ ]*]] +; X86: movl (%[[obj]]), %[[vptr:[^ ]*]] +; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] +; X86: movl %[[fp]], %eax +; X86: pushl %[[obj]] +; X86: calll __llvm_retpoline_eax +; X86: addl $4, %esp +; X86: movl %[[fp]], %eax +; X86: jmp __llvm_retpoline_eax # TAILCALL + +; X86FAST-LABEL: vcall: +; X86FAST: calll __llvm_retpoline_eax +; X86FAST: jmp __llvm_retpoline_eax # TAILCALL + + +declare void @direct_callee() + +define void @direct_tail() #0 { + tail call void @direct_callee() + ret void +} + +; X64-LABEL: direct_tail: +; X64: jmp direct_callee # TAILCALL +; X64FAST-LABEL: direct_tail: +; X64FAST: jmp direct_callee # TAILCALL +; X86-LABEL: direct_tail: +; X86: jmp direct_callee # TAILCALL +; X86FAST-LABEL: direct_tail: +; X86FAST: jmp direct_callee # TAILCALL + + +declare void @nonlazybind_callee() #1 + +define void @nonlazybind_caller() #0 { + call void @nonlazybind_callee() + tail call void @nonlazybind_callee() + ret void +} + +; X64-LABEL: nonlazybind_caller: +; X64: movq nonlazybind_callee@GOTPCREL(%rip), %[[REG:.*]] +; X64: movq %[[REG]], %r11 +; X64: callq __llvm_retpoline_r11 +; X64: movq %[[REG]], %r11 +; X64: jmp __llvm_retpoline_r11 # TAILCALL +; X64FAST-LABEL: nonlazybind_caller: +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: callq __llvm_retpoline_r11 +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: jmp __llvm_retpoline_r11 # TAILCALL +; X86-LABEL: nonlazybind_caller: +; X86: calll nonlazybind_callee@PLT +; X86: jmp nonlazybind_callee@PLT # 
TAILCALL +; X86FAST-LABEL: nonlazybind_caller: +; X86FAST: calll nonlazybind_callee@PLT +; X86FAST: jmp nonlazybind_callee@PLT # TAILCALL + + +@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0), + i8* blockaddress(@indirectbr_rewrite, %bb1), + i8* blockaddress(@indirectbr_rewrite, %bb2), + i8* blockaddress(@indirectbr_rewrite, %bb3), + i8* blockaddress(@indirectbr_rewrite, %bb4), + i8* blockaddress(@indirectbr_rewrite, %bb5), + i8* blockaddress(@indirectbr_rewrite, %bb6), + i8* blockaddress(@indirectbr_rewrite, %bb7), + i8* blockaddress(@indirectbr_rewrite, %bb8), + i8* blockaddress(@indirectbr_rewrite, %bb9)] + +; Check that when retpolines are enabled a function with indirectbr gets +; rewritten to use switch, and that in turn doesn't get lowered as a jump +; table. +define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 { +; X64-LABEL: indirectbr_rewrite: +; X64-NOT: jmpq +; X86-LABEL: indirectbr_rewrite: +; X86-NOT: jmpl +entry: + %i0 = load i64, i64* %p + %target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0 + %target0 = load i8*, i8** %target.i0 + indirectbr i8* %target0, [label %bb1, label %bb3] + +bb0: + store volatile i64 0, i64* %sink + br label %latch + +bb1: + store volatile i64 1, i64* %sink + br label %latch + +bb2: + store volatile i64 2, i64* %sink + br label %latch + +bb3: + store volatile i64 3, i64* %sink + br label %latch + +bb4: + store volatile i64 4, i64* %sink + br label %latch + +bb5: + store volatile i64 5, i64* %sink + br label %latch + +bb6: + store volatile i64 6, i64* %sink + br label %latch + +bb7: + store volatile i64 7, i64* %sink + br label %latch + +bb8: + store volatile i64 8, i64* %sink + br label %latch + +bb9: + store volatile i64 9, i64* %sink + br label %latch + +latch: + %i.next = load i64, i64* %p + %target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next + %target.next = load i8*, i8** %target.i.next + ; Potentially hit a full 10 successors here so that even if we rewrite as + ; a switch it will try to be lowered with a jump table. + indirectbr i8* %target.next, [label %bb0, + label %bb1, + label %bb2, + label %bb3, + label %bb4, + label %bb5, + label %bb6, + label %bb7, + label %bb8, + label %bb9] +} + +; Lastly check that the necessary thunks were emitted. 
+; +; X64-LABEL: .section .text.__llvm_retpoline_r11,{{.*}},__llvm_retpoline_r11,comdat +; X64-NEXT: .hidden __llvm_retpoline_r11 +; X64-NEXT: .weak __llvm_retpoline_r11 +; X64: __llvm_retpoline_r11: +; X64-NEXT: # {{.*}} # %entry +; X64-NEXT: callq [[CALL_TARGET:.*]] +; X64-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X64-NEXT: # %entry +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: pause +; X64-NEXT: lfence +; X64-NEXT: jmp [[CAPTURE_SPEC]] +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: [[CALL_TARGET]]: # Block address taken +; X64-NEXT: # %entry +; X64-NEXT: movq %r11, (%rsp) +; X64-NEXT: retq +; +; X86-LABEL: .section .text.__llvm_retpoline_eax,{{.*}},__llvm_retpoline_eax,comdat +; X86-NEXT: .hidden __llvm_retpoline_eax +; X86-NEXT: .weak __llvm_retpoline_eax +; X86: __llvm_retpoline_eax: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: retl +; +; X86-LABEL: .section .text.__llvm_retpoline_ecx,{{.*}},__llvm_retpoline_ecx,comdat +; X86-NEXT: .hidden __llvm_retpoline_ecx +; X86-NEXT: .weak __llvm_retpoline_ecx +; X86: __llvm_retpoline_ecx: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: movl %ecx, (%esp) +; X86-NEXT: retl +; +; X86-LABEL: .section .text.__llvm_retpoline_edx,{{.*}},__llvm_retpoline_edx,comdat +; X86-NEXT: .hidden __llvm_retpoline_edx +; X86-NEXT: .weak __llvm_retpoline_edx +; X86: __llvm_retpoline_edx: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: movl %edx, (%esp) +; X86-NEXT: retl +; +; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat +; X86-NEXT: .hidden __llvm_retpoline_push +; X86-NEXT: .weak __llvm_retpoline_push +; X86: __llvm_retpoline_push: +; X86-NEXT: # {{.*}} # %entry +; X86-NEXT: calll [[CALL_TARGET:.*]] +; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: pause +; X86-NEXT: lfence +; X86-NEXT: jmp [[CAPTURE_SPEC]] +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: [[CALL_TARGET]]: # Block address taken +; X86-NEXT: # %entry +; X86-NEXT: addl $4, %esp +; X86-NEXT: pushl 4(%esp) +; X86-NEXT: pushl 4(%esp) +; X86-NEXT: popl 8(%esp) +; X86-NEXT: popl (%esp) +; X86-NEXT: retl + + +attributes #0 = { "target-features"="+retpoline" } +attributes #1 = { nonlazybind } diff --git a/test/Transforms/DeadStoreElimination/merge-stores.ll b/test/Transforms/DeadStoreElimination/merge-stores.ll index ad1bc310878f..ff1bfaa4d3c8 100644 --- a/test/Transforms/DeadStoreElimination/merge-stores.ll +++ 
b/test/Transforms/DeadStoreElimination/merge-stores.ll @@ -186,6 +186,23 @@ define void @PR34074(i32* %x, i64* %y) { ret void } +; We can't eliminate the last store because P and Q may alias. + +define void @PR36129(i32* %P, i32* %Q) { +; CHECK-LABEL: @PR36129( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: [[P2:%.*]] = bitcast i32* [[P]] to i8* +; CHECK-NEXT: store i32 2, i32* [[Q:%.*]] +; CHECK-NEXT: store i8 3, i8* [[P2]] +; CHECK-NEXT: ret void +; + store i32 1, i32* %P + %P2 = bitcast i32* %P to i8* + store i32 2, i32* %Q + store i8 3, i8* %P2 + ret void +} + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 306512)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) !1 = !DIFile(filename: "me.cpp", directory: "/compiler-explorer") !2 = !{} diff --git a/test/Transforms/IndirectBrExpand/basic.ll b/test/Transforms/IndirectBrExpand/basic.ll new file mode 100644 index 000000000000..d0319c6b9946 --- /dev/null +++ b/test/Transforms/IndirectBrExpand/basic.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -indirectbr-expand -S | FileCheck %s +; +; REQUIRES: x86-registered-target + +target triple = "x86_64-unknown-linux-gnu" + +@test1.targets = constant [4 x i8*] [i8* blockaddress(@test1, %bb0), + i8* blockaddress(@test1, %bb1), + i8* blockaddress(@test1, %bb2), + i8* blockaddress(@test1, %bb3)] +; CHECK-LABEL: @test1.targets = constant [4 x i8*] +; CHECK: [i8* inttoptr (i64 1 to i8*), +; CHECK: i8* inttoptr (i64 2 to i8*), +; CHECK: i8* inttoptr (i64 3 to i8*), +; CHECK: i8* blockaddress(@test1, %bb3)] + +define void @test1(i64* readonly %p, i64* %sink) #0 { +; CHECK-LABEL: define void @test1( +entry: + %i0 = load i64, i64* %p + %target.i0 = getelementptr [4 x i8*], [4 x i8*]* @test1.targets, i64 0, i64 %i0 + %target0 = load i8*, i8** %target.i0 + ; Only a subset of blocks are viable successors here. + indirectbr i8* %target0, [label %bb0, label %bb1] +; CHECK-NOT: indirectbr +; CHECK: %[[ENTRY_V:.*]] = ptrtoint i8* %{{.*}} to i64 +; CHECK-NEXT: br label %[[SWITCH_BB:.*]] + +bb0: + store volatile i64 0, i64* %sink + br label %latch + +bb1: + store volatile i64 1, i64* %sink + br label %latch + +bb2: + store volatile i64 2, i64* %sink + br label %latch + +bb3: + store volatile i64 3, i64* %sink + br label %latch + +latch: + %i.next = load i64, i64* %p + %target.i.next = getelementptr [4 x i8*], [4 x i8*]* @test1.targets, i64 0, i64 %i.next + %target.next = load i8*, i8** %target.i.next + ; A different subset of blocks are viable successors here. 
+ indirectbr i8* %target.next, [label %bb1, label %bb2] +; CHECK-NOT: indirectbr +; CHECK: %[[LATCH_V:.*]] = ptrtoint i8* %{{.*}} to i64 +; CHECK-NEXT: br label %[[SWITCH_BB]] +; +; CHECK: [[SWITCH_BB]]: +; CHECK-NEXT: %[[V:.*]] = phi i64 [ %[[ENTRY_V]], %entry ], [ %[[LATCH_V]], %latch ] +; CHECK-NEXT: switch i64 %[[V]], label %bb0 [ +; CHECK-NEXT: i64 2, label %bb1 +; CHECK-NEXT: i64 3, label %bb2 +; CHECK-NEXT: ] +} + +attributes #0 = { "target-features"="+retpoline" } diff --git a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll index 23ec0ca25544..43fe18f1aa25 100644 --- a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll +++ b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll @@ -92,3 +92,48 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y store float %tmp21, float addrspace(1)* %output, align 4 ret void } + +; IR-LABEL: @keep_metadata( +; IR: getelementptr {{.*}} !amdgpu.uniform +; IR: getelementptr {{.*}} !amdgpu.uniform +; IR: getelementptr {{.*}} !amdgpu.uniform +define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata([0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { +main_body: + %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8 + %23 = bitcast float %22 to i32 + %24 = shl i32 %23, 1 + %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(2)* %1, i32 0, i32 %24, !amdgpu.uniform !0 + %26 = load <8 x i32>, <8 x i32> addrspace(2)* %25, align 32, !invariant.load !0 + %27 = shl i32 %23, 2 + %28 = or i32 %27, 3 + %29 = bitcast [0 x <8 x i32>] addrspace(2)* %1 to [0 x <4 x i32>] addrspace(2)* + %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %29, i32 0, i32 %28, !amdgpu.uniform !0 + %31 = load <4 x i32>, <4 x i32> addrspace(2)* %30, align 16, !invariant.load !0 + %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8 + %33 = extractelement <4 x float> %32, i32 0 + %34 = extractelement <4 x float> %32, i32 1 + %35 = extractelement <4 x float> %32, i32 2 + %36 = extractelement <4 x float> %32, i32 3 + %37 = bitcast float %4 to i32 + %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4 + %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5 + %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6 + %41 = insertvalue <{ i32, i32, i32, 
i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7 + %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8 + %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19 + ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43 +} + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6 + +; Function Attrs: nounwind readonly +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7 + + +!0 = !{} + +attributes #5 = { "InitialPSInputAddr"="45175" } +attributes #6 = { nounwind readnone speculatable } +attributes #7 = { nounwind readonly } +attributes #8 = { nounwind readnone } diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 5bc00ea35ae5..c471e0f2e3ec 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -402,6 +402,7 @@ int main(int argc, char **argv) { initializeSjLjEHPreparePass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeGlobalMergePass(Registry); + initializeIndirectBrExpandPassPass(Registry); initializeInterleavedAccessPass(Registry); initializeEntryExitInstrumenterPass(Registry); initializePostInlineEntryExitInstrumenterPass(Registry); |
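Taken together, the X86 changes above add a retpoline code path: the new IndirectBrExpand IR pass rewrites indirectbr into a switch, and indirect calls are routed through __llvm_retpoline_* thunks emitted via the new addPreEmitPass2 hook. A minimal way to exercise the imported code is sketched below; the file name and function are hypothetical and assume only the "+retpoline" target feature already used by the tests in this import:

; retpoline-example.ll (hypothetical reproducer, not part of this import)
; RUN: llc -mtriple=x86_64-unknown -o - %s
; With "+retpoline" the indirect call below should be lowered through the
; __llvm_retpoline_r11 thunk (callq __llvm_retpoline_r11) rather than a bare
; indirect "callq *%r11", matching the CHECK lines in retpoline.ll above.
define void @example(void ()* %fp) #0 {
entry:
  call void %fp()
  ret void
}

attributes #0 = { "target-features"="+retpoline" }

The emitted thunk body follows the pattern the tests verify: a call that captures the return address, a pause/lfence loop that pens in any speculative execution, and a ret whose return address has been overwritten with the real call target held in %r11 (or the chosen register on 32-bit x86).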