author     Dimitry Andric <dim@FreeBSD.org>    2022-01-27 22:06:42 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2022-01-27 22:06:42 +0000
commit     6f8fc217eaa12bf657be1c6468ed9938d10168b3 (patch)
tree       a1fd89b864d9b93e2ad68fe1dcf7afee2e3c8d76 /llvm/lib
parent     77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff)
download   src-6f8fc217eaa12bf657be1c6468ed9938d10168b3.tar.gz
           src-6f8fc217eaa12bf657be1c6468ed9938d10168b3.zip
Vendor import of llvm-project main llvmorg-14-init-17616-g024a1fab5c35.
Tag: vendor/llvm-project/llvmorg-14-init-17616-g024a1fab5c35
Diffstat (limited to 'llvm/lib')
964 files changed, 31115 insertions, 13302 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 49199060786c..a8132e5abf54 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -242,7 +242,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
   if (onlyReadsMemory(MRB))
     Result = clearMod(Result);
-  else if (doesNotReadMemory(MRB))
+  else if (onlyWritesMemory(MRB))
     Result = clearRef(Result);

   if (onlyAccessesArgPointees(MRB) || onlyAccessesInaccessibleOrArgMem(MRB)) {
@@ -320,7 +320,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call1,
   // from Call1 reading memory written by Call2.
   if (onlyReadsMemory(Call1B))
     Result = clearMod(Result);
-  else if (doesNotReadMemory(Call1B))
+  else if (onlyWritesMemory(Call1B))
     Result = clearRef(Result);

   // If Call2 only access memory through arguments, accumulate the mod/ref
@@ -988,6 +988,29 @@ bool llvm::isIdentifiedFunctionLocal(const Value *V) {
   return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasOrByValArgument(V);
 }

+bool llvm::isNotVisibleOnUnwind(const Value *Object,
+                                bool &RequiresNoCaptureBeforeUnwind) {
+  RequiresNoCaptureBeforeUnwind = false;
+
+  // Alloca goes out of scope on unwind.
+  if (isa<AllocaInst>(Object))
+    return true;
+
+  // Byval goes out of scope on unwind.
+  if (auto *A = dyn_cast<Argument>(Object))
+    return A->hasByValAttr();
+
+  // A noalias return is not accessible from any other code. If the pointer
+  // does not escape prior to the unwind, then the caller cannot access the
+  // memory either.
+  if (isNoAliasCall(Object)) {
+    RequiresNoCaptureBeforeUnwind = true;
+    return true;
+  }
+
+  return false;
+}
+
 void llvm::getAAResultsAnalysisUsage(AnalysisUsage &AU) {
   // This function needs to be in sync with llvm::createLegacyPMAAResults -- if
   // more alias analyses are added to llvm::createLegacyPMAAResults, they need
diff --git a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
index 0c097b2fa302..1577f1eb70b1 100644
--- a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -142,13 +142,13 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
   for (SetVector<Value *>::iterator I1 = Pointers.begin(), E = Pointers.end();
        I1 != E; ++I1) {
     auto I1Size = LocationSize::afterPointer();
-    Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
+    Type *I1ElTy = (*I1)->getType()->getPointerElementType();
     if (I1ElTy->isSized())
       I1Size = LocationSize::precise(DL.getTypeStoreSize(I1ElTy));

     for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) {
       auto I2Size = LocationSize::afterPointer();
-      Type *I2ElTy = cast<PointerType>((*I2)->getType())->getElementType();
+      Type *I2ElTy = (*I2)->getType()->getPointerElementType();
       if (I2ElTy->isSized())
         I2Size = LocationSize::precise(DL.getTypeStoreSize(I2ElTy));

@@ -233,7 +233,7 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
   for (CallBase *Call : Calls) {
     for (auto Pointer : Pointers) {
       auto Size = LocationSize::afterPointer();
-      Type *ElTy = cast<PointerType>(Pointer->getType())->getElementType();
+      Type *ElTy = Pointer->getType()->getPointerElementType();
       if (ElTy->isSized())
         Size = LocationSize::precise(DL.getTypeStoreSize(ElTy));

diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 5f1bf2001d47..b4c985962837 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -779,7 +779,7 @@ FunctionModRefBehavior
 BasicAAResult::getModRefBehavior(const CallBase *Call) {
   // than that.
   if (Call->onlyReadsMemory())
     Min = FMRB_OnlyReadsMemory;
-  else if (Call->doesNotReadMemory())
+  else if (Call->onlyWritesMemory())
     Min = FMRB_OnlyWritesMemory;

   if (Call->onlyAccessesArgMemory())
@@ -812,7 +812,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
   // If the function declares it only reads memory, go with that.
   if (F->onlyReadsMemory())
     Min = FMRB_OnlyReadsMemory;
-  else if (F->doesNotReadMemory())
+  else if (F->onlyWritesMemory())
     Min = FMRB_OnlyWritesMemory;

   if (F->onlyAccessesArgMemory())
@@ -972,7 +972,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
       continue;
     }
     // Operand aliases 'Object' but call only writes into it.
-    if (Call->doesNotReadMemory(OperandNo)) {
+    if (Call->onlyWritesMemory(OperandNo)) {
       Result = setMod(Result);
       continue;
     }
@@ -1020,9 +1020,9 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
         getBestAAResults().alias(MemoryLocation::getForDest(Inst), Loc, AAQI);
     // It's also possible for Loc to alias both src and dest, or neither.
     ModRefInfo rv = ModRefInfo::NoModRef;
-    if (SrcAA != AliasResult::NoAlias)
+    if (SrcAA != AliasResult::NoAlias || Call->hasReadingOperandBundles())
       rv = setRef(rv);
-    if (DestAA != AliasResult::NoAlias)
+    if (DestAA != AliasResult::NoAlias || Call->hasClobberingOperandBundles())
       rv = setMod(rv);
     return rv;
   }
@@ -1248,8 +1248,8 @@ AliasResult BasicAAResult::aliasGEP(
     else
       GCD = APIntOps::GreatestCommonDivisor(GCD, ScaleForGCD.abs());

-    ConstantRange CR =
-        computeConstantRange(Index.Val.V, true, &AC, Index.CxtI);
+    ConstantRange CR = computeConstantRange(Index.Val.V, /* ForSigned */ false,
+                                            true, &AC, Index.CxtI);
     KnownBits Known = computeKnownBits(Index.Val.V, DL, 0, &AC, Index.CxtI, DT);
     CR = CR.intersectWith(
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 856d7e90acb2..ffb80134749a 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -42,6 +42,7 @@
 #include <cassert>
 #include <cstdint>
 #include <iterator>
+#include <map>
 #include <utility>

 using namespace llvm;
diff --git a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
index 9467bb3c9b2d..090dccc53b6e 100644
--- a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
@@ -63,7 +63,7 @@ using namespace llvm::cflaa;

 CFLSteensAAResult::CFLSteensAAResult(
     std::function<const TargetLibraryInfo &(Function &F)> GetTLI)
-    : AAResultBase(), GetTLI(std::move(GetTLI)) {}
+    : GetTLI(std::move(GetTLI)) {}
 CFLSteensAAResult::CFLSteensAAResult(CFLSteensAAResult &&Arg)
     : AAResultBase(std::move(Arg)), GetTLI(std::move(Arg.GetTLI)) {}
 CFLSteensAAResult::~CFLSteensAAResult() = default;
diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp
index f2e5eab72bf2..930cb13c0cb3 100644
--- a/llvm/lib/Analysis/CallGraphSCCPass.cpp
+++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp
@@ -61,7 +61,7 @@ class CGPassManager : public ModulePass, public PMDataManager {
 public:
   static char ID;

-  explicit CGPassManager() : ModulePass(ID), PMDataManager() {}
+  explicit CGPassManager() : ModulePass(ID) {}

   /// Execute all of the passes scheduled for execution.  Keep track of
   /// whether any of the passes modifies the module, and if so, return true.
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index 9b45f455be08..ba8462e659d5 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -75,7 +75,7 @@ bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) {
 namespace {
   struct SimpleCaptureTracker : public CaptureTracker {
     explicit SimpleCaptureTracker(bool ReturnCaptures)
-      : ReturnCaptures(ReturnCaptures), Captured(false) {}
+      : ReturnCaptures(ReturnCaptures) {}

     void tooManyUses() override { Captured = true; }

@@ -89,7 +89,7 @@ namespace {

     bool ReturnCaptures;

-    bool Captured;
+    bool Captured = false;
   };

   /// Only find pointer captures which happen before the given instruction. Uses
@@ -101,7 +101,7 @@ namespace {
     CapturesBefore(bool ReturnCaptures, const Instruction *I,
                    const DominatorTree *DT, bool IncludeI, const LoopInfo *LI)
         : BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures),
-          IncludeI(IncludeI), Captured(false), LI(LI) {}
+          IncludeI(IncludeI), LI(LI) {}

     void tooManyUses() override { Captured = true; }

@@ -139,7 +139,7 @@ namespace {
     bool ReturnCaptures;
     bool IncludeI;

-    bool Captured;
+    bool Captured = false;

     const LoopInfo *LI;
   };

@@ -155,7 +155,7 @@ namespace {
   struct EarliestCaptures : public CaptureTracker {

     EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT)
-        : DT(DT), ReturnCaptures(ReturnCaptures), Captured(false), F(F) {}
+        : DT(DT), ReturnCaptures(ReturnCaptures), F(F) {}

     void tooManyUses() override {
       Captured = true;
@@ -199,7 +199,7 @@ namespace {

     bool ReturnCaptures;

-    bool Captured;
+    bool Captured = false;

     Function &F;
   };

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 922b38e92785..7cf69f613c66 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -106,11 +106,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
          "Invalid constantexpr bitcast!");

   // Catch the obvious splat cases.
-  if (C->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy())
-    return Constant::getNullValue(DestTy);
-  if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() &&
-      !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
-    return Constant::getAllOnesValue(DestTy);
+  if (Constant *Res = ConstantFoldLoadFromUniformValue(C, DestTy))
+    return Res;

   if (auto *VTy = dyn_cast<VectorType>(C->getType())) {
     // Handle a vector->scalar integer/fp cast.
@@ -362,16 +359,8 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,

   // Catch the obvious splat cases (since all-zeros can coerce non-integral
   // pointers legally).
-  if (C->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy())
-    return Constant::getNullValue(DestTy);
-  if (C->isAllOnesValue() &&
-      (DestTy->isIntegerTy() || DestTy->isFloatingPointTy() ||
-       DestTy->isVectorTy()) &&
-      !DestTy->isX86_AMXTy() && !DestTy->isX86_MMXTy() &&
-      !DestTy->isPtrOrPtrVectorTy())
-    // Get ones when the input is trivial, but
-    // only for supported types inside getAllOnesValue.
-    return Constant::getAllOnesValue(DestTy);
+  if (Constant *Res = ConstantFoldLoadFromUniformValue(C, DestTy))
+    return Res;

   // If the type sizes are the same and a cast is legal, just directly
   // cast the constant.
@@ -410,6 +399,12 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
       } while (ElemC && DL.getTypeSizeInBits(ElemC->getType()).isZero());
       C = ElemC;
     } else {
+      // For non-byte-sized vector elements, the first element is not
+      // necessarily located at the vector base address.
+      if (auto *VT = dyn_cast<VectorType>(SrcTy))
+        if (!DL.typeSizeEqualsStoreSize(VT->getElementType()))
+          return nullptr;
+
       C = C->getAggregateElement(0u);
     }
   } while (C);
@@ -558,23 +553,16 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy,

   // If this isn't an integer load we can't fold it directly.
   if (!IntType) {
-    // If this is a float/double load, we can try folding it as an int32/64 load
-    // and then bitcast the result.  This can be useful for union cases.  Note
+    // If this is a non-integer load, we can try folding it as an int load and
+    // then bitcast the result.  This can be useful for union cases.  Note
     // that address spaces don't matter here since we're not going to result in
     // an actual new load.
-    Type *MapTy;
-    if (LoadTy->isHalfTy())
-      MapTy = Type::getInt16Ty(C->getContext());
-    else if (LoadTy->isFloatTy())
-      MapTy = Type::getInt32Ty(C->getContext());
-    else if (LoadTy->isDoubleTy())
-      MapTy = Type::getInt64Ty(C->getContext());
-    else if (LoadTy->isVectorTy()) {
-      MapTy = PointerType::getIntNTy(
-          C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedSize());
-    } else
+    if (!LoadTy->isFloatingPointTy() && !LoadTy->isPointerTy() &&
+        !LoadTy->isVectorTy())
       return nullptr;

+    Type *MapTy = Type::getIntNTy(
+        C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedSize());
     if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) {
       if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
           !LoadTy->isX86_AMXTy())
@@ -680,9 +668,21 @@ Constant *llvm::ConstantFoldLoadFromConst(Constant *C, Type *Ty,
   if (Constant *Result = ConstantFoldLoadThroughBitcast(AtOffset, Ty, DL))
     return Result;

+  // Explicitly check for out-of-bounds access, so we return undef even if the
+  // constant is a uniform value.
+  TypeSize Size = DL.getTypeAllocSize(C->getType());
+  if (!Size.isScalable() && Offset.sge(Size.getFixedSize()))
+    return UndefValue::get(Ty);
+
+  // Try an offset-independent fold of a uniform value.
+  if (Constant *Result = ConstantFoldLoadFromUniformValue(C, Ty))
+    return Result;
+
   // Try hard to fold loads from bitcasted strange and non-type-safe things.
   if (Offset.getMinSignedBits() <= 64)
-    return FoldReinterpretLoadFromConst(C, Ty, Offset.getSExtValue(), DL);
+    if (Constant *Result =
+            FoldReinterpretLoadFromConst(C, Ty, Offset.getSExtValue(), DL))
+      return Result;

   return nullptr;
 }
@@ -704,15 +704,13 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
                                              Offset, DL))
       return Result;

-  // If this load comes from anywhere in a constant global, and if the global
-  // is all undef or zero, we know what it loads.
+  // If this load comes from anywhere in a uniform constant global, the value
+  // is always the same, regardless of the loaded offset.
   if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) {
     if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
-      if (GV->getInitializer()->isNullValue() && !Ty->isX86_MMXTy() &&
-          !Ty->isX86_AMXTy())
-        return Constant::getNullValue(Ty);
-      if (isa<UndefValue>(GV->getInitializer()))
-        return UndefValue::get(Ty);
+      if (Constant *Res =
+              ConstantFoldLoadFromUniformValue(GV->getInitializer(), Ty))
+        return Res;
     }
   }

@@ -725,6 +723,19 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
   return ConstantFoldLoadFromConstPtr(C, Ty, Offset, DL);
 }

+Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty) {
+  if (isa<PoisonValue>(C))
+    return PoisonValue::get(Ty);
+  if (isa<UndefValue>(C))
+    return UndefValue::get(Ty);
+  if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy())
+    return Constant::getNullValue(Ty);
+  if (C->isAllOnesValue() &&
+      (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy()))
+    return Constant::getAllOnesValue(Ty);
+  return nullptr;
+}
+
 namespace {

 /// One of Op0/Op1 is a constant expression.
@@ -930,7 +941,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
   if (auto *GV = dyn_cast<GlobalValue>(Ptr))
     SrcElemTy = GV->getValueType();
   else if (!PTy->isOpaque())
-    SrcElemTy = PTy->getElementType();
+    SrcElemTy = PTy->getNonOpaquePointerElementType();
   else
     SrcElemTy = Type::getInt8Ty(Ptr->getContext());

@@ -1171,10 +1182,11 @@ Constant *llvm::ConstantFoldInstOperands(Instruction *I,
   return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI);
 }

-Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
+Constant *llvm::ConstantFoldCompareInstOperands(unsigned IntPredicate,
                                                 Constant *Ops0, Constant *Ops1,
                                                 const DataLayout &DL,
                                                 const TargetLibraryInfo *TLI) {
+  CmpInst::Predicate Predicate = (CmpInst::Predicate)IntPredicate;
   // fold: icmp (inttoptr x), null         -> icmp x, 0
   // fold: icmp null, (inttoptr x)         -> icmp 0, x
   // fold: icmp (ptrtoint x), 0            -> icmp x, null
@@ -1248,10 +1260,30 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
           Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
       return ConstantFoldBinaryOpOperands(OpC, LHS, RHS, DL);
     }
+
+    // Convert pointer comparison (base+offset1) pred (base+offset2) into
+    // offset1 pred offset2, for the case where the offset is inbounds. This
+    // only works for equality and unsigned comparison, as inbounds permits
+    // crossing the sign boundary. However, the offset comparison itself is
+    // signed.
+    if (Ops0->getType()->isPointerTy() && !ICmpInst::isSigned(Predicate)) {
+      unsigned IndexWidth = DL.getIndexTypeSizeInBits(Ops0->getType());
+      APInt Offset0(IndexWidth, 0);
+      Value *Stripped0 =
+          Ops0->stripAndAccumulateInBoundsConstantOffsets(DL, Offset0);
+      APInt Offset1(IndexWidth, 0);
+      Value *Stripped1 =
+          Ops1->stripAndAccumulateInBoundsConstantOffsets(DL, Offset1);
+      if (Stripped0 == Stripped1)
+        return ConstantExpr::getCompare(
+            ICmpInst::getSignedPredicate(Predicate),
+            ConstantInt::get(CE0->getContext(), Offset0),
+            ConstantInt::get(CE0->getContext(), Offset1));
+    }
   } else if (isa<ConstantExpr>(Ops1)) {
     // If RHS is a constant expression, but the left side isn't, swap the
     // operands and try again.
-    Predicate = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)Predicate);
+    Predicate = ICmpInst::getSwappedPredicate(Predicate);
     return ConstantFoldCompareInstOperands(Predicate, Ops1, Ops0, DL, TLI);
   }

@@ -1347,23 +1379,6 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
   }
 }

-Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
-                                                       ConstantExpr *CE,
-                                                       Type *Ty,
-                                                       const DataLayout &DL) {
-  if (!CE->getOperand(1)->isNullValue())
-    return nullptr;  // Do not allow stepping over the value!
-
-  // Loop over all of the operands, tracking down which value we are
-  // addressing.
-  for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) {
-    C = C->getAggregateElement(CE->getOperand(i));
-    if (!C)
-      return nullptr;
-  }
-  return ConstantFoldLoadThroughBitcast(C, Ty, DL);
-}
-
 //===----------------------------------------------------------------------===//
 //  Constant Folding for Calls
 //
@@ -2463,36 +2478,21 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
         !getConstIntOrUndef(Operands[1], C1))
       return nullptr;

-    unsigned BitWidth = Ty->getScalarSizeInBits();
     switch (IntrinsicID) {
     default: break;
     case Intrinsic::smax:
-      if (!C0 && !C1)
-        return UndefValue::get(Ty);
-      if (!C0 || !C1)
-        return ConstantInt::get(Ty, APInt::getSignedMaxValue(BitWidth));
-      return ConstantInt::get(Ty, C0->sgt(*C1) ? *C0 : *C1);
-
     case Intrinsic::smin:
-      if (!C0 && !C1)
-        return UndefValue::get(Ty);
-      if (!C0 || !C1)
-        return ConstantInt::get(Ty, APInt::getSignedMinValue(BitWidth));
-      return ConstantInt::get(Ty, C0->slt(*C1) ? *C0 : *C1);
-
     case Intrinsic::umax:
-      if (!C0 && !C1)
-        return UndefValue::get(Ty);
-      if (!C0 || !C1)
-        return ConstantInt::get(Ty, APInt::getMaxValue(BitWidth));
-      return ConstantInt::get(Ty, C0->ugt(*C1) ? *C0 : *C1);
-
     case Intrinsic::umin:
       if (!C0 && !C1)
         return UndefValue::get(Ty);
       if (!C0 || !C1)
-        return ConstantInt::get(Ty, APInt::getMinValue(BitWidth));
-      return ConstantInt::get(Ty, C0->ult(*C1) ? *C0 : *C1);
+        return MinMaxIntrinsic::getSaturationPoint(IntrinsicID, Ty);
+      return ConstantInt::get(
+          Ty, ICmpInst::compare(*C0, *C1,
+                                MinMaxIntrinsic::getPredicate(IntrinsicID))
+                  ? *C0
+                  : *C1);

     case Intrinsic::usub_with_overflow:
     case Intrinsic::ssub_with_overflow:
@@ -2572,9 +2572,9 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
     case Intrinsic::ctlz:
       assert(C1 && "Must be constant int");

-      // cttz(0, 1) and ctlz(0, 1) are undef.
+      // cttz(0, 1) and ctlz(0, 1) are poison.
       if (C1->isOne() && (!C0 || C0->isZero()))
-        return UndefValue::get(Ty);
+        return PoisonValue::get(Ty);
       if (!C0)
         return Constant::getNullValue(Ty);
       if (IntrinsicID == Intrinsic::cttz)
@@ -2583,13 +2583,15 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
         return ConstantInt::get(Ty, C0->countLeadingZeros());

     case Intrinsic::abs:
-      // Undef or minimum val operand with poison min --> undef
       assert(C1 && "Must be constant int");
+      assert((C1->isOne() || C1->isZero()) && "Must be 0 or 1");
+
+      // Undef or minimum val operand with poison min --> undef
       if (C1->isOne() && (!C0 || C0->isMinSignedValue()))
         return UndefValue::get(Ty);

       // Undef operand with no poison min --> 0 (sign bit must be clear)
-      if (C1->isZero() && !C0)
+      if (!C0)
         return Constant::getNullValue(Ty);

       return ConstantInt::get(Ty, C0->abs());
diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp
index 9739c6af5769..773f71ada0ee 100644
--- a/llvm/lib/Analysis/ConstraintSystem.cpp
+++ b/llvm/lib/Analysis/ConstraintSystem.cpp
@@ -142,7 +142,7 @@ bool ConstraintSystem::mayHaveSolution() {
   return HasSolution;
 }

-bool ConstraintSystem::isConditionImplied(SmallVector<int64_t, 8> R) {
+bool ConstraintSystem::isConditionImplied(SmallVector<int64_t, 8> R) const {
   // If all variable coefficients are 0, we have 'C >= 0'. If the constant is >=
   // 0, R is always true, regardless of the system.
   if (all_of(makeArrayRef(R).drop_front(1), [](int64_t C) { return C == 0; }))
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index f407ec0d017a..326bacad01fe 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -50,7 +50,7 @@ namespace {
   public:
     static char ID; // Class identification, replacement for typeinfo

-    CostModelAnalysis() : FunctionPass(ID), F(nullptr), TTI(nullptr) {
+    CostModelAnalysis() : FunctionPass(ID) {
       initializeCostModelAnalysisPass(
           *PassRegistry::getPassRegistry());
     }

@@ -69,9 +69,9 @@ namespace {
     void print(raw_ostream &OS, const Module*) const override;

     /// The function that we analyze.
-    Function *F;
+    Function *F = nullptr;

     /// Target information.
-    const TargetTransformInfo *TTI;
+    const TargetTransformInfo *TTI = nullptr;
   };
 }  // End of anonymous namespace

diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp
index da5de75a038c..7e1357959a3f 100644
--- a/llvm/lib/Analysis/DDG.cpp
+++ b/llvm/lib/Analysis/DDG.cpp
@@ -106,7 +106,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode &N) {
 //===--------------------------------------------------------------------===//

 SimpleDDGNode::SimpleDDGNode(Instruction &I)
-    : DDGNode(NodeKind::SingleInstruction), InstList() {
+    : DDGNode(NodeKind::SingleInstruction) {
   assert(InstList.empty() && "Expected empty list.");
   InstList.push_back(&I);
 }
diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
index 31b2dafa29b4..4a792fce51d1 100644
--- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
@@ -11,8 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/Config/config.h"
+#include "llvm/Support/Casting.h"

 #if defined(LLVM_HAVE_TF_API)
+#include "llvm/ADT/BitVector.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h"
 #include "llvm/Analysis/MLInlineAdvisor.h"
@@ -111,7 +113,7 @@ private:
   StringRef LogFileName;
   const ModelUnderTrainingRunner *const MUTR;
   std::unique_ptr<Logger> L;
-  std::vector<bool> Effects;
+  BitVector Effects;
   /// There's at least one output. We'll set this to a different value if MUTR
   /// is avaliable.
   size_t OutputCount = 1;
@@ -150,7 +152,7 @@ public:
   DevelopmentModeMLInlineAdvisor(
       Module &M, ModuleAnalysisManager &MAM,
       std::unique_ptr<MLModelRunner> ModelRunner,
-      std::function<bool(CallBase &)> GetDefaultAdvice, bool IsDoingInference,
+      std::function<bool(CallBase &)> GetDefaultAdvice,
       std::unique_ptr<TrainingLogger> Logger);

   size_t getTotalSizeEstimate();
@@ -341,10 +343,11 @@ void TrainingLogger::print() {
 DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor(
     Module &M, ModuleAnalysisManager &MAM,
     std::unique_ptr<MLModelRunner> ModelRunner,
-    std::function<bool(CallBase &)> GetDefaultAdvice, bool IsDoingInference,
+    std::function<bool(CallBase &)> GetDefaultAdvice,
     std::unique_ptr<TrainingLogger> Logger)
     : MLInlineAdvisor(M, MAM, std::move(ModelRunner)),
-      GetDefaultAdvice(GetDefaultAdvice), IsDoingInference(IsDoingInference),
+      GetDefaultAdvice(GetDefaultAdvice),
+      IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())),
       Logger(std::move(Logger)),
       InitialNativeSize(isLogging() ? getTotalSizeEstimate() : 0),
       CurrentNativeSize(InitialNativeSize) {
@@ -410,8 +413,6 @@ size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() {
   for (auto &F : M) {
     if (F.isDeclaration())
       continue;
-    if (isFunctionDeleted(&F))
-      continue;
     Ret += *getNativeSizeEstimate(F);
   }
   return Ret;
@@ -422,30 +423,20 @@ std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor(
     std::function<bool(CallBase &)> GetDefaultAdvice) {
   auto &Ctx = M.getContext();
   std::unique_ptr<MLModelRunner> Runner;
-  ModelUnderTrainingRunner *MUTRPtr = nullptr;
-  bool IsDoingInference = false;
   if (TFModelUnderTrainingPath.empty())
     Runner.reset(new NoInferenceModelRunner(Ctx, getInputFeatures()));
-  else {
-    std::unique_ptr<ModelUnderTrainingRunner> MUTR;
-    if (auto MaybeOutputSpecs = loadOutputSpecs(
-            Ctx, DecisionName, TFModelUnderTrainingPath, TFOutputSpecOverride))
-      MUTR = std::make_unique<ModelUnderTrainingRunner>(
-          Ctx, TFModelUnderTrainingPath, getInputFeatures(), *MaybeOutputSpecs);
-    if (!MUTR || !MUTR->isValid()) {
-      Ctx.emitError("Could not load the policy model from the provided path");
-      return nullptr;
-    }
-    IsDoingInference = true;
-    MUTRPtr = MUTR.get();
-    Runner = std::move(MUTR);
-  }
+  else
+    Runner = ModelUnderTrainingRunner::createAndEnsureValid(
+        Ctx, TFModelUnderTrainingPath, DecisionName, getInputFeatures(),
+        TFOutputSpecOverride);
+  if (!Runner)
+    return nullptr;
   std::unique_ptr<TrainingLogger> Logger;
   if (!TrainingLog.empty())
-    Logger = std::make_unique<TrainingLogger>(TrainingLog, MUTRPtr);
+    Logger = std::make_unique<TrainingLogger>(
+        TrainingLog, dyn_cast<ModelUnderTrainingRunner>(Runner.get()));
   return std::make_unique<DevelopmentModeMLInlineAdvisor>(
-      M, MAM, std::move(Runner), GetDefaultAdvice, IsDoingInference,
-      std::move(Logger));
+      M, MAM, std::move(Runner), GetDefaultAdvice, std::move(Logger));
 }
 #endif // defined(LLVM_HAVE_TF_API)
diff --git a/llvm/lib/Analysis/DivergenceAnalysis.cpp b/llvm/lib/Analysis/DivergenceAnalysis.cpp
index 7426d0c07592..39e80c2ad51c 100644
--- a/llvm/lib/Analysis/DivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -130,7 +130,7 @@ bool DivergenceAnalysisImpl::inRegion(const Instruction &I) const {
 }

 bool DivergenceAnalysisImpl::inRegion(const BasicBlock &BB) const {
-  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+  return RegionLoop ? RegionLoop->contains(&BB) : (BB.getParent() == &F);
 }

 void DivergenceAnalysisImpl::pushUsers(const Value &V) {
@@ -348,7 +348,7 @@ DivergenceInfo::DivergenceInfo(Function &F, const DominatorTree &DT,
                                const PostDominatorTree &PDT, const LoopInfo &LI,
                                const TargetTransformInfo &TTI,
                                bool KnownReducible)
-    : F(F), ContainsIrreducible(false) {
+    : F(F) {
   if (!KnownReducible) {
     using RPOTraversal = ReversePostOrderTraversal<const Function *>;
     RPOTraversal FuncRPOT(&F);
diff --git a/llvm/lib/Analysis/DomPrinter.cpp b/llvm/lib/Analysis/DomPrinter.cpp
index ebbe0d3e2c5f..6088de53028d 100644
--- a/llvm/lib/Analysis/DomPrinter.cpp
+++ b/llvm/lib/Analysis/DomPrinter.cpp
@@ -80,6 +80,19 @@ struct DOTGraphTraits<PostDominatorTree*>
 };
 }

+PreservedAnalyses DomTreePrinterPass::run(Function &F,
+                                          FunctionAnalysisManager &AM) {
+  WriteDOTGraphToFile(F, &AM.getResult<DominatorTreeAnalysis>(F), "dom", false);
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses DomTreeOnlyPrinterPass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  WriteDOTGraphToFile(F, &AM.getResult<DominatorTreeAnalysis>(F), "domonly",
+                      true);
+  return PreservedAnalyses::all();
+}
+
 void DominatorTree::viewGraph(const Twine &Name, const Twine &Title) {
 #ifndef NDEBUG
   ViewGraph(this, Name, false, Title);
diff --git a/llvm/lib/Analysis/DominanceFrontier.cpp b/llvm/lib/Analysis/DominanceFrontier.cpp
index 14e6965f1259..a8806fe5a480 100644
--- a/llvm/lib/Analysis/DominanceFrontier.cpp
+++ b/llvm/lib/Analysis/DominanceFrontier.cpp
@@ -37,7 +37,7 @@ INITIALIZE_PASS_END(DominanceFrontierWrapperPass, "domfrontier",
                 "Dominance Frontier Construction", true, true)

 DominanceFrontierWrapperPass::DominanceFrontierWrapperPass()
-    : FunctionPass(ID), DF() {
+    : FunctionPass(ID) {
   initializeDominanceFrontierWrapperPassPass(*PassRegistry::getPassRegistry());
 }

diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp
index d00a7c944f10..6869530148c5 100644
--- a/llvm/lib/Analysis/GlobalsModRef.cpp
+++ b/llvm/lib/Analysis/GlobalsModRef.cpp
@@ -102,7 +102,7 @@ class GlobalsAAResult::FunctionInfo {
         "Insufficient low bits to store our flag and ModRef info.");

 public:
-  FunctionInfo() : Info() {}
+  FunctionInfo() {}
   ~FunctionInfo() { delete Info.getPointer(); }

@@ -401,14 +401,14 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,

 /// AnalyzeIndirectGlobalMemory - We found an non-address-taken global variable
 /// which holds a pointer type.  See if the global always points to non-aliased
-/// heap memory: that is, all initializers of the globals are allocations, and
-/// those allocations have no use other than initialization of the global.
+/// heap memory: that is, all initializers of the globals store a value known
+/// to be obtained via a noalias return function call which have no other use.
 /// Further, all loads out of GV must directly use the memory, not store the
 /// pointer somewhere.  If this is true, we consider the memory pointed to by
 /// GV to be owned by GV and can disambiguate other pointers from it.
 bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) {
   // Keep track of values related to the allocation of the memory, f.e. the
-  // value produced by the malloc call and any casts.
+  // value produced by the noalias call and any casts.
   std::vector<Value *> AllocRelatedValues;

   // If the initializer is a valid pointer, bail.
@@ -438,7 +438,7 @@ bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) {

       // Check the value being stored.
       Value *Ptr = getUnderlyingObject(SI->getOperand(0));
-      if (!isAllocLikeFn(Ptr, &GetTLI(*SI->getFunction())))
+      if (!isNoAliasCall(Ptr))
         return false; // Too hard to analyze.

       // Analyze all uses of the allocation.  If any of them are used in a
@@ -963,7 +963,7 @@ ModRefInfo GlobalsAAResult::getModRefInfo(const CallBase *Call,
 GlobalsAAResult::GlobalsAAResult(
     const DataLayout &DL,
     std::function<const TargetLibraryInfo &(Function &F)> GetTLI)
-    : AAResultBase(), DL(DL), GetTLI(std::move(GetTLI)) {}
+    : DL(DL), GetTLI(std::move(GetTLI)) {}

 GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg)
     : AAResultBase(std::move(Arg)), DL(Arg.DL), GetTLI(std::move(Arg.GetTLI)),
diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
index 2ec6cbeabda2..d2f0c57f6dab 100644
--- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
+++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
@@ -23,12 +23,24 @@
 using namespace llvm;
 using namespace IRSimilarity;

+namespace llvm {
 cl::opt<bool>
     DisableBranches("no-ir-sim-branch-matching", cl::init(false),
                     cl::ReallyHidden,
                     cl::desc("disable similarity matching, and outlining, "
                              "across branches for debugging purposes."));

+cl::opt<bool>
+    DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(false),
+                         cl::ReallyHidden,
+                         cl::desc("disable outlining indirect calls."));
+
+cl::opt<bool>
+    MatchCallsByName("ir-sim-calls-by-name", cl::init(false), cl::ReallyHidden,
+                     cl::desc("only allow matching call instructions if the "
+                              "name and type signature match."));
+} // namespace llvm
+
 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
                                      IRInstructionDataList &IDList)
     : Inst(&I), Legal(Legality), IDL(&IDList) {
@@ -57,10 +69,16 @@ void IRInstructionData::initializeInstruction() {
     OperVals.push_back(OI.get());
   }
+
+  // We capture the incoming BasicBlocks as values as well as the incoming
+  // Values in order to check for structural similarity.
+  if (PHINode *PN = dyn_cast<PHINode>(Inst))
+    for (BasicBlock *BB : PN->blocks())
+      OperVals.push_back(BB);
 }

 IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
-    : Inst(nullptr), Legal(false), IDL(&IDList) {}
+    : IDL(&IDList) {}

 void IRInstructionData::setBranchSuccessors(
     DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
@@ -86,6 +104,43 @@ void IRInstructionData::setBranchSuccessors(
   }
 }

+void IRInstructionData::setCalleeName(bool MatchByName) {
+  CallInst *CI = dyn_cast<CallInst>(Inst);
+  assert(CI && "Instruction must be call");
+
+  CalleeName = "";
+  if (!CI->isIndirectCall() && MatchByName)
+    CalleeName = CI->getCalledFunction()->getName().str();
+}
+
+void IRInstructionData::setPHIPredecessors(
+    DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
+  assert(isa<PHINode>(Inst) && "Instruction must be phi node");
+
+  PHINode *PN = cast<PHINode>(Inst);
+  DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
+
+  BBNumIt = BasicBlockToInteger.find(PN->getParent());
+  assert(BBNumIt != BasicBlockToInteger.end() &&
+         "Could not find location for BasicBlock!");
+
+  int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
+
+  // Convert the incoming blocks of the PHINode to an integer value, based on
+  // the relative distances between the current block and the incoming block.
+  for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
+    BasicBlock *Incoming = PN->getIncomingBlock(Idx);
+    BBNumIt = BasicBlockToInteger.find(Incoming);
+    assert(BBNumIt != BasicBlockToInteger.end() &&
+           "Could not find number for BasicBlock!");
+    int OtherBlockNumber = static_cast<int>(BBNumIt->second);
+
+    int Relative = OtherBlockNumber - CurrentBlockNumber;
+    RelativeBlockLocations.push_back(Relative);
+    RelativeBlockLocations.push_back(Relative);
+  }
+}
+
 CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
   switch (CI->getPredicate()) {
   case CmpInst::FCMP_OGT:
@@ -112,10 +167,13 @@ CmpInst::Predicate IRInstructionData::getPredicate() const {
   return cast<CmpInst>(Inst)->getPredicate();
 }

-static StringRef getCalledFunctionName(CallInst &CI) {
-  assert(CI.getCalledFunction() != nullptr && "Called Function is nullptr?");
+StringRef IRInstructionData::getCalleeName() const {
+  assert(isa<CallInst>(Inst) &&
+         "Can only get a name from a call instruction");

-  return CI.getCalledFunction()->getName();
+  assert(CalleeName.hasValue() && "CalleeName has not been set");
+
+  return *CalleeName;
 }

 bool IRSimilarity::isClose(const IRInstructionData &A,
@@ -170,13 +228,11 @@ bool IRSimilarity::isClose(const IRInstructionData &A,
         });
   }

-  // If the instructions are functions, we make sure that the function name is
-  // the same.  We already know that the types are since is isSameOperationAs is
-  // true.
+  // If the instructions are functions calls, we make sure that the function
+  // name is the same.  We already know that the types are since is
+  // isSameOperationAs is true.
   if (isa<CallInst>(A.Inst) && isa<CallInst>(B.Inst)) {
-    CallInst *CIA = cast<CallInst>(A.Inst);
-    CallInst *CIB = cast<CallInst>(B.Inst);
-    if (getCalledFunctionName(*CIA).compare(getCalledFunctionName(*CIB)) != 0)
+    if (A.getCalleeName().str().compare(B.getCalleeName().str()) != 0)
       return false;
   }

@@ -244,6 +300,12 @@ unsigned IRInstructionMapper::mapToLegalUnsigned(
   if (isa<BranchInst>(*It))
     ID->setBranchSuccessors(BasicBlockToInteger);

+  if (isa<CallInst>(*It))
+    ID->setCalleeName(EnableMatchCallsByName);
+
+  if (isa<PHINode>(*It))
+    ID->setPHIPredecessors(BasicBlockToInteger);
+
   // Add to the instruction list
   bool WasInserted;
   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
@@ -1075,6 +1137,8 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
   std::vector<IRInstructionData *> InstrList;
   std::vector<unsigned> IntegerMapping;
   Mapper.InstClassifier.EnableBranches = this->EnableBranches;
+  Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
+  Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;

   populateMapper(Modules, InstrList, IntegerMapping);
   findCandidates(InstrList, IntegerMapping);
@@ -1085,6 +1149,8 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
   resetSimilarityCandidates();
   Mapper.InstClassifier.EnableBranches = this->EnableBranches;
+  Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
+  Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;

   std::vector<IRInstructionData *> InstrList;
   std::vector<unsigned> IntegerMapping;
@@ -1105,7 +1171,8 @@ IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
 }

 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
-  IRSI.reset(new IRSimilarityIdentifier(!DisableBranches));
+  IRSI.reset(new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
+                                        MatchCallsByName));
   return false;
 }

@@ -1123,7 +1190,8 @@ AnalysisKey IRSimilarityAnalysis::Key;
 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
                                                  ModuleAnalysisManager &) {
-  auto IRSI = IRSimilarityIdentifier(!DisableBranches);
+  auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
+                                     MatchCallsByName);
   IRSI.findSimilarity(M);
   return IRSI;
 }
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index f5fa6748d053..44b1d94ebdc8 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -161,19 +161,22 @@ static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,

 /// Collect cast instructions that can be ignored in the vectorizer's cost
 /// model, given a reduction exit value and the minimal type in which the
-/// reduction can be represented.
-static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
-                                 Type *RecurrenceType,
-                                 SmallPtrSetImpl<Instruction *> &Casts) {
+// reduction can be represented. Also search casts to the recurrence type
+// to find the minimum width used by the recurrence.
+static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
+                              Type *RecurrenceType,
+                              SmallPtrSetImpl<Instruction *> &Casts,
+                              unsigned &MinWidthCastToRecurTy) {

   SmallVector<Instruction *, 8> Worklist;
   SmallPtrSet<Instruction *, 8> Visited;
   Worklist.push_back(Exit);
+  MinWidthCastToRecurTy = -1U;

   while (!Worklist.empty()) {
     Instruction *Val = Worklist.pop_back_val();
     Visited.insert(Val);
-    if (auto *Cast = dyn_cast<CastInst>(Val))
+    if (auto *Cast = dyn_cast<CastInst>(Val)) {
       if (Cast->getSrcTy() == RecurrenceType) {
         // If the source type of a cast instruction is equal to the recurrence
         // type, it will be eliminated, and should be ignored in the vectorizer
@@ -181,7 +184,16 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
         Casts.insert(Cast);
         continue;
       }
-
+      if (Cast->getDestTy() == RecurrenceType) {
+        // The minimum width used by the recurrence is found by checking for
+        // casts on its operands. The minimum width is used by the vectorizer
+        // when finding the widest type for in-loop reductions without any
+        // loads/stores.
+        MinWidthCastToRecurTy = std::min<unsigned>(
+            MinWidthCastToRecurTy, Cast->getSrcTy()->getScalarSizeInBits());
+        continue;
+      }
+    }
     // Add all operands to the work list if they are loop-varying values that
     // we haven't yet visited.
     for (Value *O : cast<User>(Val)->operands())
@@ -265,6 +277,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Data used for determining if the recurrence has been type-promoted.
   Type *RecurrenceType = Phi->getType();
   SmallPtrSet<Instruction *, 4> CastInsts;
+  unsigned MinWidthCastToRecurrenceType;
   Instruction *Start = Phi;
   bool IsSigned = false;

@@ -296,6 +309,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // flags from all the reduction operations.
   FastMathFlags FMF = FastMathFlags::getFast();

+  // The first instruction in the use-def chain of the Phi node that requires
+  // exact floating point operations.
+  Instruction *ExactFPMathInst = nullptr;
+
   // A value in the reduction can be used:
   //  - By the reduction:
   //      - Reduction operation:
@@ -339,6 +356,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
     if (Cur != Start) {
       ReduxDesc =
           isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
+      ExactFPMathInst = ExactFPMathInst == nullptr
+                            ? ReduxDesc.getExactFPMathInst()
+                            : ExactFPMathInst;
       if (!ReduxDesc.isRecurrence())
         return false;
       // FIXME: FMF is allowed on phi, but propagation is not handled correctly.
@@ -467,8 +487,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;

-  const bool IsOrdered = checkOrderedReduction(
-      Kind, ReduxDesc.getExactFPMathInst(), ExitInstruction, Phi);
+  const bool IsOrdered =
+      checkOrderedReduction(Kind, ExactFPMathInst, ExitInstruction, Phi);

   if (Start != Phi) {
     // If the starting value is not the same as the phi node, we speculatively
@@ -500,21 +520,24 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
         computeRecurrenceType(ExitInstruction, DB, AC, DT);
     if (ComputedType != RecurrenceType)
       return false;
-
-    // The recurrence expression will be represented in a narrower type. If
-    // there are any cast instructions that will be unnecessary, collect them
-    // in CastInsts. Note that the 'and' instruction was already included in
-    // this list.
-    //
-    // TODO: A better way to represent this may be to tag in some way all the
-    // instructions that are a part of the reduction. The vectorizer cost
-    // model could then apply the recurrence type to these instructions,
-    // without needing a white list of instructions to ignore.
-    // This may also be useful for the inloop reductions, if it can be
-    // kept simple enough.
-    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
   }

+  // Collect cast instructions and the minimum width used by the recurrence.
+  // If the starting value is not the same as the phi node and the computed
+  // recurrence type is equal to the recurrence type, the recurrence expression
+  // will be represented in a narrower or wider type. If there are any cast
+  // instructions that will be unnecessary, collect them in CastsFromRecurTy.
+  // Note that the 'and' instruction was already included in this list.
+  //
+  // TODO: A better way to represent this may be to tag in some way all the
+  // instructions that are a part of the reduction. The vectorizer cost
+  // model could then apply the recurrence type to these instructions,
+  // without needing a white list of instructions to ignore.
+  // This may also be useful for the inloop reductions, if it can be
+  // kept simple enough.
+  collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
+                    MinWidthCastToRecurrenceType);
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.

@@ -522,9 +545,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // is saved as part of the RecurrenceDescriptor.

   // Save the description of this reduction variable.
-  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
-                          ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, IsOrdered, CastInsts);
+  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
+                          RecurrenceType, IsSigned, IsOrdered, CastInsts,
+                          MinWidthCastToRecurrenceType);
   RedDes = RD;

   return true;
@@ -1397,8 +1420,9 @@ bool InductionDescriptor::isInductionPHI(

   // Always use i8 element type for opaque pointer inductions.
   PointerType *PtrTy = cast<PointerType>(PhiTy);
-  Type *ElementType = PtrTy->isOpaque() ? Type::getInt8Ty(PtrTy->getContext())
-                                        : PtrTy->getElementType();
+  Type *ElementType = PtrTy->isOpaque()
+                          ? Type::getInt8Ty(PtrTy->getContext())
+                          : PtrTy->getNonOpaquePointerElementType();
   if (!ElementType->isSized())
     return false;

diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index d7b202f83189..0f3929f45506 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -254,7 +254,7 @@ IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {

 IVUsers::IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT,
                  ScalarEvolution *SE)
-    : L(L), AC(AC), LI(LI), DT(DT), SE(SE), IVUses() {
+    : L(L), AC(AC), LI(LI), DT(DT), SE(SE) {
   // Collect ephemeral values so that AddUsersIfInteresting skips them.
   EphValues.clear();
   CodeMetrics::collectEphemeralValues(L, AC, EphValues);
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index 140c88eb8b0d..f6e3dd354ff8 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -21,11 +21,15 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"

 using namespace llvm;
 #define DEBUG_TYPE "inline"
+#ifdef LLVM_HAVE_TF_AOT_INLINERSIZEMODEL
+#define LLVM_HAVE_TF_AOT
+#endif

 // This weirdly named statistic tracks the number of times that, when attempting
 // to inline a function A into B, we analyze the callers of B in order to see
@@ -160,18 +164,6 @@ InlineAdvice::InlineAdvice(InlineAdvisor *Advisor, CallBase &CB,
       DLoc(CB.getDebugLoc()), Block(CB.getParent()), ORE(ORE),
       IsInliningRecommended(IsInliningRecommended) {}

-void InlineAdvisor::markFunctionAsDeleted(Function *F) {
-  assert((!DeletedFunctions.count(F)) &&
-         "Cannot put cause a function to become dead twice!");
-  DeletedFunctions.insert(F);
-}
-
-void InlineAdvisor::freeDeletedFunctions() {
-  for (auto *F : DeletedFunctions)
-    delete F;
-  DeletedFunctions.clear();
-}
-
 void InlineAdvice::recordInlineStatsIfNeeded() {
   if (Advisor->ImportedFunctionsStats)
     Advisor->ImportedFunctionsStats->recordInline(*Caller, *Callee);
@@ -186,7 +178,6 @@ void InlineAdvice::recordInlining() {
 void InlineAdvice::recordInliningWithCalleeDeleted() {
   markRecorded();
   recordInlineStatsIfNeeded();
-  Advisor->markFunctionAsDeleted(Callee);
   recordInliningWithCalleeDeletedImpl();
 }

@@ -523,8 +514,6 @@ InlineAdvisor::~InlineAdvisor() {
     ImportedFunctionsStats->dump(InlinerFunctionImportStats ==
                                  InlinerFunctionImportStatsOpts::Verbose);
   }
-
-  freeDeletedFunctions();
 }

 std::unique_ptr<InlineAdvice> InlineAdvisor::getMandatoryAdvice(CallBase &CB,
@@ -569,3 +558,13 @@ std::unique_ptr<InlineAdvice> InlineAdvisor::getAdvice(CallBase &CB,
 OptimizationRemarkEmitter &InlineAdvisor::getCallerORE(CallBase &CB) {
   return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*CB.getCaller());
 }
+
+PreservedAnalyses
+InlineAdvisorAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  const auto *IA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+  if (!IA)
+    OS << "No Inline Advisor\n";
+  else
+    IA->getAdvisor()->print(OS);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index ff31e81aad08..d5411d916c77 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -361,10 +361,10 @@ protected:
   /// Model the elimination of repeated loads that is expected to happen
   /// whenever we simplify away the stores that would otherwise cause them to be
   /// loads.
-  bool EnableLoadElimination;
+  bool EnableLoadElimination = true;

   /// Whether we allow inlining for recursive call.
-  bool AllowRecursiveCall;
+  bool AllowRecursiveCall = false;

   SmallPtrSet<Value *, 16> LoadAddrSet;

@@ -455,8 +455,7 @@ public:
                 OptimizationRemarkEmitter *ORE = nullptr)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
-        CandidateCall(Call), EnableLoadElimination(true),
-        AllowRecursiveCall(false) {}
+        CandidateCall(Call) {}

   InlineResult analyze();

@@ -2898,15 +2897,6 @@ Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
   if (Call.isNoInline())
     return InlineResult::failure("noinline call site attribute");

-  // Don't inline functions if one does not have any stack protector attribute
-  // but the other does.
-  if (Caller->hasStackProtectorFnAttr() && !Callee->hasStackProtectorFnAttr())
-    return InlineResult::failure(
-        "stack protected caller but callee requested no stack protector");
-  if (Callee->hasStackProtectorFnAttr() && !Caller->hasStackProtectorFnAttr())
-    return InlineResult::failure(
-        "stack protected callee but caller requested no stack protector");
-
   return None;
 }

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4831b22b1d46..b71b39334ace 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/CmpInstAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/OverflowInstAnalysis.h"
@@ -70,7 +71,7 @@ static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
 static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
 static Value *SimplifyCastInst(unsigned, Value *, Type *, const SimplifyQuery &,
                                unsigned);
-static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, bool,
+static Value *SimplifyGEPInst(Type *, Value *, ArrayRef<Value *>, bool,
                               const SimplifyQuery &, unsigned);
 static Value *SimplifySelectInst(Value *, Value *, Value *,
                                  const SimplifyQuery &, unsigned);
@@ -620,6 +621,10 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
   if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
     return C;

+  // X + poison -> poison
+  if (isa<PoisonValue>(Op1))
+    return Op1;
+
   // X + undef -> undef
   if (Q.isUndefValue(Op1))
     return Op1;
@@ -1074,6 +1079,16 @@ static bool isDivZero(Value *X, Value *Y, const SimplifyQuery &Q,
   }

   // IsSigned == false.
+
+  // Is the unsigned dividend known to be less than a constant divisor?
+  // TODO: Convert this (and above) to range analysis
+  //      ("computeConstantRangeIncludingKnownBits")?
+  const APInt *C;
+  if (match(Y, m_APInt(C)) &&
+      computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI, Q.DT).getMaxValue().ult(*C))
+    return true;
+
+  // Try again for any divisor:
   // Is the dividend unsigned less than the divisor?
   return isICmpTrue(ICmpInst::ICMP_ULT, X, Y, Q, MaxRecurse);
 }

@@ -2254,14 +2269,21 @@ static Value *simplifyOrLogic(Value *X, Value *Y) {
       match(Y, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
     return NotA;

-  // ~(A ^ B) | (A & B) --> ~(A & B)
-  // ~(A ^ B) | (B & A) --> ~(A & B)
+  // ~(A ^ B) | (A & B) --> ~(A ^ B)
+  // ~(A ^ B) | (B & A) --> ~(A ^ B)
   Value *NotAB;
   if (match(X, m_CombineAnd(m_NotForbidUndef(m_Xor(m_Value(A), m_Value(B))),
                             m_Value(NotAB))) &&
       match(Y, m_c_And(m_Specific(A), m_Specific(B))))
     return NotAB;

+  // ~(A & B) | (A ^ B) --> ~(A & B)
+  // ~(A & B) | (B ^ A) --> ~(A & B)
+  if (match(X, m_CombineAnd(m_NotForbidUndef(m_And(m_Value(A), m_Value(B))),
+                            m_Value(NotAB))) &&
+      match(Y, m_c_Xor(m_Specific(A), m_Specific(B))))
+    return NotAB;
+
   return nullptr;
 }

@@ -2685,7 +2707,9 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS,

     // Fold comparisons for non-escaping pointer even if the allocation call
     // cannot be elided. We cannot fold malloc comparison to null. Also, the
-    // dynamic allocation call could be either of the operands.
+    // dynamic allocation call could be either of the operands. Note that
+    // the other operand can not be based on the alloc - if it were, then
+    // the cmp itself would be a capture.
     Value *MI = nullptr;
     if (isAllocLikeFn(LHS, TLI) &&
         llvm::isKnownNonZero(RHS, DL, 0, nullptr, CxtI, DT))
@@ -2890,7 +2914,8 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
   if (RHS_CR.isFullSet())
     return ConstantInt::getTrue(ITy);

-  ConstantRange LHS_CR = computeConstantRange(LHS, IIQ.UseInstrInfo);
+  ConstantRange LHS_CR =
+      computeConstantRange(LHS, CmpInst::isSigned(Pred), IIQ.UseInstrInfo);
   if (!LHS_CR.isFullSet()) {
     if (RHS_CR.contains(LHS_CR))
       return ConstantInt::getTrue(ITy);
@@ -4057,9 +4082,9 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                             NewOps[1], Q, MaxRecurse - 1));

   if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
-    return PreventSelfSimplify(SimplifyGEPInst(GEP->getSourceElementType(),
-                                               NewOps, GEP->isInBounds(), Q,
-                                               MaxRecurse - 1));
+    return PreventSelfSimplify(SimplifyGEPInst(
+        GEP->getSourceElementType(), NewOps[0], makeArrayRef(NewOps).slice(1),
+        GEP->isInBounds(), Q, MaxRecurse - 1));

   if (isa<SelectInst>(I))
     return PreventSelfSimplify(
@@ -4417,45 +4442,52 @@ Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,

 /// Given operands for an GetElementPtrInst, see if we can fold the result.
 /// If not, this returns null.
-static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
-                              const SimplifyQuery &Q, unsigned) {
+static Value *SimplifyGEPInst(Type *SrcTy, Value *Ptr,
+                              ArrayRef<Value *> Indices, bool InBounds,
+                              const SimplifyQuery &Q, unsigned) {
   // The type of the GEP pointer operand.
   unsigned AS =
-      cast<PointerType>(Ops[0]->getType()->getScalarType())->getAddressSpace();
+      cast<PointerType>(Ptr->getType()->getScalarType())->getAddressSpace();

   // getelementptr P -> P.
-  if (Ops.size() == 1)
-    return Ops[0];
+  if (Indices.empty())
+    return Ptr;

   // Compute the (pointer) type returned by the GEP instruction.
-  Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1));
+  Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Indices);
   Type *GEPTy = PointerType::get(LastType, AS);
-  for (Value *Op : Ops) {
-    // If one of the operands is a vector, the result type is a vector of
-    // pointers. All vector operands must have the same number of elements.
-    if (VectorType *VT = dyn_cast<VectorType>(Op->getType())) {
-      GEPTy = VectorType::get(GEPTy, VT->getElementCount());
-      break;
+  if (VectorType *VT = dyn_cast<VectorType>(Ptr->getType()))
+    GEPTy = VectorType::get(GEPTy, VT->getElementCount());
+  else {
+    for (Value *Op : Indices) {
+      // If one of the operands is a vector, the result type is a vector of
+      // pointers. All vector operands must have the same number of elements.
+      if (VectorType *VT = dyn_cast<VectorType>(Op->getType())) {
+        GEPTy = VectorType::get(GEPTy, VT->getElementCount());
+        break;
+      }
     }
   }

   // getelementptr poison, idx -> poison
   // getelementptr baseptr, poison -> poison
-  if (any_of(Ops, [](const auto *V) { return isa<PoisonValue>(V); }))
+  if (isa<PoisonValue>(Ptr) ||
+      any_of(Indices, [](const auto *V) { return isa<PoisonValue>(V); }))
     return PoisonValue::get(GEPTy);

-  if (Q.isUndefValue(Ops[0]))
-    return UndefValue::get(GEPTy);
+  if (Q.isUndefValue(Ptr))
+    // If inbounds, we can choose an out-of-bounds pointer as a base pointer.
+    return InBounds ? PoisonValue::get(GEPTy) : UndefValue::get(GEPTy);

   bool IsScalableVec =
-      isa<ScalableVectorType>(SrcTy) || any_of(Ops, [](const Value *V) {
+      isa<ScalableVectorType>(SrcTy) || any_of(Indices, [](const Value *V) {
         return isa<ScalableVectorType>(V->getType());
       });

-  if (Ops.size() == 2) {
+  if (Indices.size() == 1) {
     // getelementptr P, 0 -> P.
-    if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy)
-      return Ops[0];
+    if (match(Indices[0], m_Zero()) && Ptr->getType() == GEPTy)
+      return Ptr;

     Type *Ty = SrcTy;
     if (!IsScalableVec && Ty->isSized()) {
@@ -4463,37 +4495,37 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
       uint64_t C;
       uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty);
       // getelementptr P, N -> P if P points to a type of zero size.
-      if (TyAllocSize == 0 && Ops[0]->getType() == GEPTy)
-        return Ops[0];
+      if (TyAllocSize == 0 && Ptr->getType() == GEPTy)
+        return Ptr;

       // The following transforms are only safe if the ptrtoint cast
       // doesn't truncate the pointers.
-      if (Ops[1]->getType()->getScalarSizeInBits() ==
+      if (Indices[0]->getType()->getScalarSizeInBits() ==
           Q.DL.getPointerSizeInBits(AS)) {
-        auto CanSimplify = [GEPTy, &P, V = Ops[0]]() -> bool {
+        auto CanSimplify = [GEPTy, &P, Ptr]() -> bool {
           return P->getType() == GEPTy &&
-                 getUnderlyingObject(P) == getUnderlyingObject(V);
+                 getUnderlyingObject(P) == getUnderlyingObject(Ptr);
         };

         // getelementptr V, (sub P, V) -> P if P points to a type of size 1.
         if (TyAllocSize == 1 &&
-            match(Ops[1], m_Sub(m_PtrToInt(m_Value(P)),
-                                m_PtrToInt(m_Specific(Ops[0])))) &&
+            match(Indices[0],
+                  m_Sub(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Specific(Ptr)))) &&
             CanSimplify())
           return P;

         // getelementptr V, (ashr (sub P, V), C) -> P if P points to a type of
         // size 1 << C.
-        if (match(Ops[1], m_AShr(m_Sub(m_PtrToInt(m_Value(P)),
-                                       m_PtrToInt(m_Specific(Ops[0]))),
-                                 m_ConstantInt(C))) &&
+        if (match(Indices[0], m_AShr(m_Sub(m_PtrToInt(m_Value(P)),
+                                           m_PtrToInt(m_Specific(Ptr))),
+                                     m_ConstantInt(C))) &&
            TyAllocSize == 1ULL << C && CanSimplify())
          return P;

         // getelementptr V, (sdiv (sub P, V), C) -> P if P points to a type of
         // size C.
- if (match(Ops[1], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)), - m_PtrToInt(m_Specific(Ops[0]))), - m_SpecificInt(TyAllocSize))) && + if (match(Indices[0], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)), + m_PtrToInt(m_Specific(Ptr))), + m_SpecificInt(TyAllocSize))) && CanSimplify()) return P; } @@ -4501,29 +4533,28 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds, } if (!IsScalableVec && Q.DL.getTypeAllocSize(LastType) == 1 && - all_of(Ops.slice(1).drop_back(1), + all_of(Indices.drop_back(1), [](Value *Idx) { return match(Idx, m_Zero()); })) { unsigned IdxWidth = - Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace()); - if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) { + Q.DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()); + if (Q.DL.getTypeSizeInBits(Indices.back()->getType()) == IdxWidth) { APInt BasePtrOffset(IdxWidth, 0); Value *StrippedBasePtr = - Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL, - BasePtrOffset); + Ptr->stripAndAccumulateInBoundsConstantOffsets(Q.DL, BasePtrOffset); // Avoid creating inttoptr of zero here: While LLVMs treatment of // inttoptr is generally conservative, this particular case is folded to // a null pointer, which will have incorrect provenance. // gep (gep V, C), (sub 0, V) -> C - if (match(Ops.back(), + if (match(Indices.back(), m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr)))) && !BasePtrOffset.isZero()) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset); return ConstantExpr::getIntToPtr(CI, GEPTy); } // gep (gep V, C), (xor V, -1) -> C-1 - if (match(Ops.back(), + if (match(Indices.back(), m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes())) && !BasePtrOffset.isOne()) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1); @@ -4533,17 +4564,18 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds, } // Check to see if this is constant foldable. - if (!all_of(Ops, [](Value *V) { return isa<Constant>(V); })) + if (!isa<Constant>(Ptr) || + !all_of(Indices, [](Value *V) { return isa<Constant>(V); })) return nullptr; - auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]), - Ops.slice(1), InBounds); + auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ptr), Indices, + InBounds); return ConstantFoldConstant(CE, Q.DL); } -Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds, - const SimplifyQuery &Q) { - return ::SimplifyGEPInst(SrcTy, Ops, InBounds, Q, RecursionLimit); +Value *llvm::SimplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef<Value *> Indices, + bool InBounds, const SimplifyQuery &Q) { + return ::SimplifyGEPInst(SrcTy, Ptr, Indices, InBounds, Q, RecursionLimit); } /// Given operands for an InsertValueInst, see if we can fold the result. 
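Callers adapt to the new signature by splitting the old flat operand list into the base pointer and the index list. A minimal caller-side sketch (assuming, as at the call sites in this diff, that `NewOps` holds the GEP's full operand list):

    // Old shape: SimplifyGEPInst(SrcTy, NewOps, InBounds, Q)
    // New shape: pointer operand and indices are passed separately.
    if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(),
                                   NewOps[0],                     // base pointer
                                   makeArrayRef(NewOps).slice(1), // indices only
                                   GEP->isInBounds(), Q))
      return V;

The same shape appears in simplifyWithOpReplaced and simplifyInstructionWithOperands above, and with an explicit ArrayRef slice at the PHITransAddr.cpp call site further down in this patch.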
@@ -5603,26 +5635,6 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, return nullptr; } -static APInt getMaxMinLimit(Intrinsic::ID IID, unsigned BitWidth) { - switch (IID) { - case Intrinsic::smax: return APInt::getSignedMaxValue(BitWidth); - case Intrinsic::smin: return APInt::getSignedMinValue(BitWidth); - case Intrinsic::umax: return APInt::getMaxValue(BitWidth); - case Intrinsic::umin: return APInt::getMinValue(BitWidth); - default: llvm_unreachable("Unexpected intrinsic"); - } -} - -static ICmpInst::Predicate getMaxMinPredicate(Intrinsic::ID IID) { - switch (IID) { - case Intrinsic::smax: return ICmpInst::ICMP_SGE; - case Intrinsic::smin: return ICmpInst::ICMP_SLE; - case Intrinsic::umax: return ICmpInst::ICMP_UGE; - case Intrinsic::umin: return ICmpInst::ICMP_ULE; - default: llvm_unreachable("Unexpected intrinsic"); - } -} - /// Given a min/max intrinsic, see if it can be removed based on having an /// operand that is another min/max intrinsic with shared operand(s). The caller /// is expected to swap the operand arguments to handle commutation. @@ -5690,19 +5702,21 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, // Assume undef is the limit value. if (Q.isUndefValue(Op1)) - return ConstantInt::get(ReturnType, getMaxMinLimit(IID, BitWidth)); + return ConstantInt::get( + ReturnType, MinMaxIntrinsic::getSaturationPoint(IID, BitWidth)); const APInt *C; if (match(Op1, m_APIntAllowUndef(C))) { // Clamp to limit value. For example: // umax(i8 %x, i8 255) --> 255 - if (*C == getMaxMinLimit(IID, BitWidth)) + if (*C == MinMaxIntrinsic::getSaturationPoint(IID, BitWidth)) return ConstantInt::get(ReturnType, *C); // If the constant op is the opposite of the limit value, the other must // be larger/smaller or equal. For example: // umin(i8 %x, i8 255) --> %x - if (*C == getMaxMinLimit(getInverseMinMaxIntrinsic(IID), BitWidth)) + if (*C == MinMaxIntrinsic::getSaturationPoint( + getInverseMinMaxIntrinsic(IID), BitWidth)) return Op0; // Remove nested call if constant operands allow it. 
Example: @@ -5713,10 +5727,9 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, Value *M00 = MinMax0->getOperand(0), *M01 = MinMax0->getOperand(1); const APInt *InnerC; if ((match(M00, m_APInt(InnerC)) || match(M01, m_APInt(InnerC))) && - ((IID == Intrinsic::smax && InnerC->sge(*C)) || - (IID == Intrinsic::smin && InnerC->sle(*C)) || - (IID == Intrinsic::umax && InnerC->uge(*C)) || - (IID == Intrinsic::umin && InnerC->ule(*C)))) + ICmpInst::compare(*InnerC, *C, + ICmpInst::getNonStrictPredicate( + MinMaxIntrinsic::getPredicate(IID)))) return Op0; } } @@ -5726,7 +5739,8 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, if (Value *V = foldMinMaxSharedOp(IID, Op1, Op0)) return V; - ICmpInst::Predicate Pred = getMaxMinPredicate(IID); + ICmpInst::Predicate Pred = + ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID)); if (isICmpTrue(Pred, Op0, Op1, Q.getWithoutUndef(), RecursionLimit)) return Op0; if (isICmpTrue(Pred, Op1, Op0, Q.getWithoutUndef(), RecursionLimit)) @@ -6277,8 +6291,9 @@ static Value *simplifyInstructionWithOperands(Instruction *I, break; case Instruction::GetElementPtr: { auto *GEPI = cast<GetElementPtrInst>(I); - Result = SimplifyGEPInst(GEPI->getSourceElementType(), NewOps, - GEPI->isInBounds(), Q); + Result = + SimplifyGEPInst(GEPI->getSourceElementType(), NewOps[0], + makeArrayRef(NewOps).slice(1), GEPI->isInBounds(), Q); break; } case Instruction::InsertValue: { @@ -6460,3 +6475,5 @@ const SimplifyQuery getBestSimplifyQuery(AnalysisManager<T, TArgs...> &AM, template const SimplifyQuery getBestSimplifyQuery(AnalysisManager<Function> &, Function &); } + +void InstSimplifyFolder::anchor() {} diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp index 0007c54b16d0..e8e9593d7030 100644 --- a/llvm/lib/Analysis/LazyCallGraph.cpp +++ b/llvm/lib/Analysis/LazyCallGraph.cpp @@ -1503,7 +1503,7 @@ void LazyCallGraph::removeEdge(Node &SourceN, Node &TargetN) { void LazyCallGraph::removeDeadFunction(Function &F) { // FIXME: This is unnecessarily restrictive. We should be able to remove // functions which recursively call themselves. - assert(F.use_empty() && + assert(F.hasZeroLiveUses() && "This routine should only be called on trivially dead functions!"); // We shouldn't remove library functions as they are never really dead while @@ -1522,13 +1522,6 @@ void LazyCallGraph::removeDeadFunction(Function &F) { // Remove this from the entry edges if present. EntryEdges.removeEdgeInternal(N); - if (SCCMap.empty()) { - // No SCCs have been formed, so removing this is fine and there is nothing - // else necessary at this point but clearing out the node. - N.clear(); - return; - } - // Cannot remove a function which has yet to be visited in the DFS walk, so // if we have a node at all then we must have an SCC and RefSCC. auto CI = SCCMap.find(&N); @@ -1544,15 +1537,9 @@ void LazyCallGraph::removeDeadFunction(Function &F) { assert(C.size() == 1 && "Dead functions must be in a singular SCC"); assert(RC.size() == 1 && "Dead functions must be in a singular RefSCC"); - auto RCIndexI = RefSCCIndices.find(&RC); - int RCIndex = RCIndexI->second; - PostOrderRefSCCs.erase(PostOrderRefSCCs.begin() + RCIndex); - RefSCCIndices.erase(RCIndexI); - for (int i = RCIndex, Size = PostOrderRefSCCs.size(); i < Size; ++i) - RefSCCIndices[PostOrderRefSCCs[i]] = i; - // Finally clear out all the data structures from the node down through the - // components. + // components. 
postorder_ref_scc_iterator will skip empty RefSCCs, so no need + // to adjust LazyCallGraph data structures. N.clear(); N.G = nullptr; N.F = nullptr; diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 5b5d48bf6fe5..e311b40ab25c 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -395,7 +395,8 @@ class LazyValueInfoImpl { /// if it exists in the module. Function *GuardDecl; - Optional<ValueLatticeElement> getBlockValue(Value *Val, BasicBlock *BB); + Optional<ValueLatticeElement> getBlockValue(Value *Val, BasicBlock *BB, + Instruction *CxtI); Optional<ValueLatticeElement> getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T, Instruction *CxtI = nullptr); @@ -533,15 +534,17 @@ void LazyValueInfoImpl::solve() { } } -Optional<ValueLatticeElement> LazyValueInfoImpl::getBlockValue(Value *Val, - BasicBlock *BB) { +Optional<ValueLatticeElement> LazyValueInfoImpl::getBlockValue( + Value *Val, BasicBlock *BB, Instruction *CxtI) { // If already a constant, there is nothing to compute. if (Constant *VC = dyn_cast<Constant>(Val)) return ValueLatticeElement::get(VC); if (Optional<ValueLatticeElement> OptLatticeVal = - TheCache.getCachedValueInfo(Val, BB)) + TheCache.getCachedValueInfo(Val, BB)) { + intersectAssumeOrGuardBlockValueConstantRange(Val, *OptLatticeVal, CxtI); return OptLatticeVal; + } // We have hit a cycle, assume overdefined. if (!pushBlockValue({ BB, Val })) @@ -792,31 +795,41 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange( } } +static ConstantRange getConstantRangeOrFull(const ValueLatticeElement &Val, + Type *Ty, const DataLayout &DL) { + if (Val.isConstantRange()) + return Val.getConstantRange(); + return ConstantRange::getFull(DL.getTypeSizeInBits(Ty)); +} + Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect( SelectInst *SI, BasicBlock *BB) { // Recurse on our inputs if needed Optional<ValueLatticeElement> OptTrueVal = - getBlockValue(SI->getTrueValue(), BB); + getBlockValue(SI->getTrueValue(), BB, SI); if (!OptTrueVal) return None; ValueLatticeElement &TrueVal = *OptTrueVal; Optional<ValueLatticeElement> OptFalseVal = - getBlockValue(SI->getFalseValue(), BB); + getBlockValue(SI->getFalseValue(), BB, SI); if (!OptFalseVal) return None; ValueLatticeElement &FalseVal = *OptFalseVal; - if (TrueVal.isConstantRange() && FalseVal.isConstantRange()) { - const ConstantRange &TrueCR = TrueVal.getConstantRange(); - const ConstantRange &FalseCR = FalseVal.getConstantRange(); + if (TrueVal.isConstantRange() || FalseVal.isConstantRange()) { + const ConstantRange &TrueCR = + getConstantRangeOrFull(TrueVal, SI->getType(), DL); + const ConstantRange &FalseCR = + getConstantRangeOrFull(FalseVal, SI->getType(), DL); Value *LHS = nullptr; Value *RHS = nullptr; SelectPatternResult SPR = matchSelectPattern(SI, LHS, RHS); // Is this a min specifically of our two inputs? (Avoid the risk of // ValueTracking getting smarter looking back past our immediate inputs.) 
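      // Note (illustrative, not added by this patch): matchSelectPattern
      // canonicalizes on the compare, so for a min/max written "backwards",
      // e.g.  %s = select (icmp sgt %x, %y), %y, %x  (an smin), it can report
      // LHS == %x, RHS == %y -- swapped relative to the select's true/false
      // arms. That is why both operand orders are accepted below.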
if (SelectPatternResult::isMinOrMax(SPR.Flavor) && - LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) { + ((LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) || + (RHS == SI->getTrueValue() && LHS == SI->getFalseValue()))) { ConstantRange ResultCR = [&]() { switch (SPR.Flavor) { default: @@ -873,17 +886,10 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect( Optional<ConstantRange> LazyValueInfoImpl::getRangeFor(Value *V, Instruction *CxtI, BasicBlock *BB) { - Optional<ValueLatticeElement> OptVal = getBlockValue(V, BB); + Optional<ValueLatticeElement> OptVal = getBlockValue(V, BB, CxtI); if (!OptVal) return None; - - ValueLatticeElement &Val = *OptVal; - intersectAssumeOrGuardBlockValueConstantRange(V, Val, CxtI); - if (Val.isConstantRange()) - return Val.getConstantRange(); - - const unsigned OperandBitWidth = DL.getTypeSizeInBits(V->getType()); - return ConstantRange::getFull(OperandBitWidth); + return getConstantRangeOrFull(*OptVal, V->getType(), DL); } Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueCast( @@ -1017,7 +1023,7 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueExtractValue( if (Value *V = SimplifyExtractValueInst( EVI->getAggregateOperand(), EVI->getIndices(), EVI->getModule()->getDataLayout())) - return getBlockValue(V, BB); + return getBlockValue(V, BB, EVI); LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined (unknown extractvalue).\n"); @@ -1126,14 +1132,16 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, } // If (X urem Modulus) >= C, then X >= C. + // If trunc X >= C, then X >= C. // TODO: An upper bound could be computed as well. - if (match(LHS, m_URem(m_Specific(Val), m_Value())) && + if (match(LHS, m_CombineOr(m_URem(m_Specific(Val), m_Value()), + m_Trunc(m_Specific(Val)))) && match(RHS, m_APInt(C))) { // Use the icmp region so we don't have to deal with different predicates. ConstantRange CR = ConstantRange::makeExactICmpRegion(EdgePred, *C); if (!CR.isEmptySet()) return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( - CR.getUnsignedMin(), APInt(BitWidth, 0))); + CR.getUnsignedMin().zextOrSelf(BitWidth), APInt(BitWidth, 0))); } return ValueLatticeElement::getOverdefined(); @@ -1430,14 +1438,12 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::getEdgeValue( // Can't get any more precise here return LocalResult; - Optional<ValueLatticeElement> OptInBlock = getBlockValue(Val, BBFrom); + Optional<ValueLatticeElement> OptInBlock = + getBlockValue(Val, BBFrom, BBFrom->getTerminator()); if (!OptInBlock) return None; ValueLatticeElement &InBlock = *OptInBlock; - // Try to intersect ranges of the BB and the constraint on the edge. 
- intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock, - BBFrom->getTerminator()); // We can use the context instruction (generically the ultimate instruction // the calling pass is trying to simplify) here, even though the result of // this function is generally cached when called from the solve* functions @@ -1457,15 +1463,14 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB, << BB->getName() << "'\n"); assert(BlockValueStack.empty() && BlockValueSet.empty()); - Optional<ValueLatticeElement> OptResult = getBlockValue(V, BB); + Optional<ValueLatticeElement> OptResult = getBlockValue(V, BB, CxtI); if (!OptResult) { solve(); - OptResult = getBlockValue(V, BB); + OptResult = getBlockValue(V, BB, CxtI); assert(OptResult && "Value not available after solving"); } - ValueLatticeElement Result = *OptResult; - intersectAssumeOrGuardBlockValueConstantRange(V, Result, CxtI); + ValueLatticeElement Result = *OptResult; LLVM_DEBUG(dbgs() << " Result = " << Result << "\n"); return Result; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 6444518dc70c..2ab78d2b7ee2 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -519,8 +519,7 @@ public: AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE) - : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), - IsRTCheckAnalysisNeeded(false), PSE(PSE) {} + : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), PSE(PSE) {} /// Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, bool IsReadOnly) { @@ -620,7 +619,7 @@ private: /// memcheck analysis without dependency checking /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is /// cleared while this remains set if we have potentially dependent accesses. - bool IsRTCheckAnalysisNeeded; + bool IsRTCheckAnalysisNeeded = false; /// The SCEV predicate containing all the SCEV-related assumptions. 
PredicatedScalarEvolution &PSE; @@ -1055,7 +1054,6 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, bool ShouldCheckWrap) { Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); - assert(!AccessTy->isAggregateType() && "Bad stride - Not a pointer to a scalar type"); if (isa<ScalableVectorType>(AccessTy)) { LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy @@ -2245,10 +2243,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, DominatorTree *DT, LoopInfo *LI) : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)), PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)), - DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L), - NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), - HasConvergentOp(false), - HasDependenceInvolvingLoopInvariantAddress(false) { + DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) { if (canAnalyzeLoop()) analyzeLoop(AA, LI, TLI, DT); } diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index 7b895d8a5dc2..ba014bd08c98 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -477,9 +477,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) { CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, TargetTransformInfo &TTI, - AAResults &AA, DependenceInfo &DI, - Optional<unsigned> TRT) - : Loops(Loops), TripCounts(), LoopCosts(), + AAResults &AA, DependenceInfo &DI, Optional<unsigned> TRT) + : Loops(Loops), TRT((TRT == None) ? Optional<unsigned>(TemporalReuseThreshold) : TRT), LI(LI), SE(SE), TTI(TTI), AA(AA), DI(DI) { assert(!Loops.empty() && "Expecting a non-empty loop vector."); diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index b35fb2a190f6..dd6958716127 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -695,11 +695,10 @@ class UnloopUpdater { // Flag the presence of an irreducible backedge whose destination is a block // directly contained by the original unloop. - bool FoundIB; + bool FoundIB = false; public: - UnloopUpdater(Loop *UL, LoopInfo *LInfo) - : Unloop(*UL), LI(LInfo), DFS(UL), FoundIB(false) {} + UnloopUpdater(Loop *UL, LoopInfo *LInfo) : Unloop(*UL), LI(LInfo), DFS(UL) {} void updateBlockParents(); diff --git a/llvm/lib/Analysis/LoopPass.cpp b/llvm/lib/Analysis/LoopPass.cpp index 9e470e998e67..b720bab454e9 100644 --- a/llvm/lib/Analysis/LoopPass.cpp +++ b/llvm/lib/Analysis/LoopPass.cpp @@ -69,8 +69,7 @@ char PrintLoopPassWrapper::ID = 0; char LPPassManager::ID = 0; -LPPassManager::LPPassManager() - : FunctionPass(ID), PMDataManager() { +LPPassManager::LPPassManager() : FunctionPass(ID) { LI = nullptr; CurrentLoop = nullptr; } diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index f5a65cd2b689..0480c1cd2842 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -11,35 +11,34 @@ // 'release' mode) or a runtime-loaded model (the 'development' case). 
// //===----------------------------------------------------------------------===// -#include "llvm/Config/config.h" -#if defined(LLVM_HAVE_TF_AOT) || defined(LLVM_HAVE_TF_API) - -#include <limits> -#include <unordered_map> -#include <unordered_set> - +#include "llvm/Analysis/MLInlineAdvisor.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/MLInlineAdvisor.h" +#include "llvm/Analysis/InlineModelFeatureMaps.h" +#include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/MLModelRunner.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ReleaseModeModelRunner.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Config/config.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Path.h" +#include <limits> +#include <unordered_map> +#include <unordered_set> + using namespace llvm; -#ifdef LLVM_HAVE_TF_AOT -#include "llvm/Analysis/ReleaseModeModelRunner.h" +#if defined(LLVM_HAVE_TF_AOT_INLINERSIZEMODEL) // codegen-ed file #include "InlinerSizeModel.h" // NOLINT -#include "llvm/Analysis/InlineModelFeatureMaps.h" std::unique_ptr<InlineAdvisor> llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) { @@ -90,7 +89,8 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM, std::unique_ptr<MLModelRunner> Runner) : InlineAdvisor( M, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()), - ModelRunner(std::move(Runner)), CG(new CallGraph(M)), + ModelRunner(std::move(Runner)), + CG(MAM.getResult<LazyCallGraphAnalysis>(M)), InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize) { assert(ModelRunner); @@ -100,7 +100,8 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM, // critical in behavioral cloning - i.e. training a model to mimic the manual // heuristic's decisions - and, thus, equally important for training for // improvement. - for (auto I = scc_begin(CG.get()); !I.isAtEnd(); ++I) { + CallGraph CGraph(M); + for (auto I = scc_begin(&CGraph); !I.isAtEnd(); ++I) { const std::vector<CallGraphNode *> &CGNodes = *I; unsigned Level = 0; for (auto *CGNode : CGNodes) { @@ -110,7 +111,7 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM, for (auto &I : instructions(F)) { if (auto *CS = getInlinableCS(I)) { auto *Called = CS->getCalledFunction(); - auto Pos = FunctionLevels.find(Called); + auto Pos = FunctionLevels.find(&CG.get(*Called)); // In bottom up traversal, an inlinable callee is either in the // same SCC, or to a function in a visited SCC. So not finding its // level means we haven't visited it yet, meaning it's in this SCC. @@ -123,24 +124,73 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM, for (auto *CGNode : CGNodes) { Function *F = CGNode->getFunction(); if (F && !F->isDeclaration()) - FunctionLevels[F] = Level; + FunctionLevels[&CG.get(*F)] = Level; } } + for (auto KVP : FunctionLevels) { + AllNodes.insert(KVP.first); + EdgeCount += getLocalCalls(KVP.first->getFunction()); + } + NodeCount = AllNodes.size(); +} + +unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const { + return CG.lookup(F) ? 
FunctionLevels.at(CG.lookup(F)) : 0; } void MLInlineAdvisor::onPassEntry() { // Function passes executed between InlinerPass runs may have changed the // module-wide features. - if (!Invalid) - return; - NodeCount = 0; - EdgeCount = 0; - for (auto &F : M) - if (!F.isDeclaration()) { - ++NodeCount; - EdgeCount += getLocalCalls(F); + // The cgscc pass manager rules are such that: + // - if a pass leads to merging SCCs, then the pipeline is restarted on the + // merged SCC + // - if a pass leads to splitting the SCC, then we continue with one of the + // splits + // This means that the NodesInLastSCC is a superset (not strict) of the nodes + // that subsequent passes would have processed + // - in addition, if new Nodes were created by a pass (e.g. CoroSplit), + // they'd be adjacent to Nodes in the last SCC. So we just need to check the + // boundary of Nodes in NodesInLastSCC for Nodes we haven't seen. We don't + // care about the nature of the Edge (call or ref). + NodeCount -= static_cast<int64_t>(NodesInLastSCC.size()); + while (!NodesInLastSCC.empty()) { + const auto *N = NodesInLastSCC.front(); + NodesInLastSCC.pop_front(); + // The Function wrapped by N could have been deleted since we last saw it. + if (N->isDead()) { + assert(!N->getFunction().isDeclaration()); + continue; + } + ++NodeCount; + EdgeCount += getLocalCalls(N->getFunction()); + for (const auto &E : *(*N)) { + const auto *AdjNode = &E.getNode(); + assert(!AdjNode->isDead() && !AdjNode->getFunction().isDeclaration()); + auto I = AllNodes.insert(AdjNode); + if (I.second) + NodesInLastSCC.push_back(AdjNode); } - Invalid = false; + } + + EdgeCount -= EdgesOfLastSeenNodes; + EdgesOfLastSeenNodes = 0; +} + +void MLInlineAdvisor::onPassExit(LazyCallGraph::SCC *LastSCC) { + if (!LastSCC) + return; + // Keep track of the nodes and edges we last saw. Then, in onPassEntry, + // we update the node count and edge count from the subset of these nodes that + // survived. 
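  // Rough shape of the bookkeeping (a sketch of the intent, not extra code in
  // this diff): onPassExit snapshots the SCC --
  //   EdgesOfLastSeenNodes = sum of getLocalCalls(N) over the SCC's nodes
  //   NodesInLastSCC       = those nodes themselves
  // -- and onPassEntry replays only the survivors, so functions deleted in
  // between (N->isDead()) silently drop out of NodeCount and EdgeCount.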
+ assert(NodesInLastSCC.empty()); + assert(NodeCount >= LastSCC->size()); + EdgesOfLastSeenNodes = 0; + for (const auto &N : *LastSCC) { + assert(!N.isDead()); + EdgesOfLastSeenNodes += getLocalCalls(N.getFunction()); + NodesInLastSCC.push_back(&N); + } + assert(EdgeCount >= EdgesOfLastSeenNodes); } int64_t MLInlineAdvisor::getLocalCalls(Function &F) { @@ -192,7 +242,7 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice, int64_t MLInlineAdvisor::getModuleIRSize() const { int64_t Ret = 0; - for (auto &F : CG->getModule()) + for (auto &F : M) if (!F.isDeclaration()) Ret += getIRSize(F); return Ret; @@ -263,7 +313,7 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { *ModelRunner->getTensor<int64_t>(FeatureIndex::CalleeBasicBlockCount) = CalleeBefore.BasicBlockCount; *ModelRunner->getTensor<int64_t>(FeatureIndex::CallSiteHeight) = - FunctionLevels[&Caller]; + getInitialFunctionLevel(Caller); *ModelRunner->getTensor<int64_t>(FeatureIndex::NodeCount) = NodeCount; *ModelRunner->getTensor<int64_t>(FeatureIndex::NrCtantParams) = NrCtantParams; *ModelRunner->getTensor<int64_t>(FeatureIndex::EdgeCount) = EdgeCount; @@ -361,4 +411,3 @@ void MLInlineAdvice::recordUnattemptedInliningImpl() { return R; }); } -#endif // defined(LLVM_HAVE_TF_AOT) || defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index ffdd7a2cfd4b..208f93aa1ac6 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -51,12 +51,13 @@ using namespace llvm; enum AllocType : uint8_t { OpNewLike = 1<<0, // allocates; never returns null - MallocLike = 1<<1 | OpNewLike, // allocates; may return null + MallocLike = 1<<1, // allocates; may return null AlignedAllocLike = 1<<2, // allocates with alignment; may return null CallocLike = 1<<3, // allocates + bzero ReallocLike = 1<<4, // reallocates StrDupLike = 1<<5, - MallocOrCallocLike = MallocLike | CallocLike | AlignedAllocLike, + MallocOrOpNewLike = MallocLike | OpNewLike, + MallocOrCallocLike = MallocLike | OpNewLike | CallocLike | AlignedAllocLike, AllocLike = MallocOrCallocLike | StrDupLike, AnyAlloc = AllocLike | ReallocLike }; @@ -66,64 +67,59 @@ struct AllocFnsTy { unsigned NumParams; // First and Second size parameters (or -1 if unused) int FstParam, SndParam; + // Alignment parameter for aligned_alloc and aligned new + int AlignParam; }; // FIXME: certain users need more information. E.g., SimplifyLibCalls needs to // know which functions are nounwind, noalias, nocapture parameters, etc. 
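// Row format after this change: {AllocType, NumParams, FstParam, SndParam,
// AlignParam}. For example, aligned_alloc(align, size) carries its size in
// parameter 1 and its alignment in parameter 0:
//   {LibFunc_aligned_alloc, {AlignedAllocLike, 2, /*size*/ 1, -1, /*align*/ 0}}
// whereas plain malloc(size) has no alignment parameter (AlignParam == -1).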
static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = { - {LibFunc_malloc, {MallocLike, 1, 0, -1}}, - {LibFunc_vec_malloc, {MallocLike, 1, 0, -1}}, - {LibFunc_valloc, {MallocLike, 1, 0, -1}}, - {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int) - {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) - {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t) - {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // new(unsigned int, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long) - {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow) - {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t) - {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) - {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) - {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t) - {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long) - {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow) - {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t) - {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int) - {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) - {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long) - {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow) - {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) - {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) - {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long) - {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow) - {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}}, - {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1}}, - {LibFunc_calloc, {CallocLike, 2, 0, 1}}, - {LibFunc_vec_calloc, {CallocLike, 2, 0, 1}}, - {LibFunc_realloc, {ReallocLike, 2, 1, -1}}, - {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1}}, - {LibFunc_reallocf, {ReallocLike, 2, 1, -1}}, - {LibFunc_strdup, {StrDupLike, 1, -1, -1}}, - {LibFunc_strndup, {StrDupLike, 2, 1, -1}}, - {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1}}, - // TODO: Handle "int posix_memalign(void **, size_t, size_t)" + {LibFunc_malloc, {MallocLike, 1, 0, -1, -1}}, + {LibFunc_vec_malloc, {MallocLike, 1, 0, -1, -1}}, + {LibFunc_valloc, {MallocLike, 1, 0, -1, -1}}, + {LibFunc_Znwj, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned int) + {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new(unsigned int, nothrow) + {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new(unsigned int, align_val_t) + {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new(unsigned int, align_val_t, nothrow) + {LibFunc_Znwm, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned long) + {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // 
new(unsigned long, nothrow) + {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new(unsigned long, align_val_t) + {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new(unsigned long, align_val_t, nothrow) + {LibFunc_Znaj, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned int) + {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned int, nothrow) + {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new[](unsigned int, align_val_t) + {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new[](unsigned int, align_val_t, nothrow) + {LibFunc_Znam, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned long) + {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned long, nothrow) + {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new[](unsigned long, align_val_t) + {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new[](unsigned long, align_val_t, nothrow) + {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned int) + {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1, -1}}, // new(unsigned int, nothrow) + {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned long long) + {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1, -1}}, // new(unsigned long long, nothrow) + {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned int) + {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned int, nothrow) + {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned long long) + {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned long long, nothrow) + {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1, 0}}, + {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1, 0}}, + {LibFunc_calloc, {CallocLike, 2, 0, 1, -1}}, + {LibFunc_vec_calloc, {CallocLike, 2, 0, 1, -1}}, + {LibFunc_realloc, {ReallocLike, 2, 1, -1, -1}}, + {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1, -1}}, + {LibFunc_reallocf, {ReallocLike, 2, 1, -1, -1}}, + {LibFunc_strdup, {StrDupLike, 1, -1, -1, -1}}, + {LibFunc_strndup, {StrDupLike, 2, 1, -1, -1}}, + {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1, -1}}, + // TODO: Handle "int posix_memalign(void **, size_t, size_t)" }; -static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast, +static const Function *getCalledFunction(const Value *V, bool &IsNoBuiltin) { // Don't care about intrinsics in this case. 
if (isa<IntrinsicInst>(V)) return nullptr; - if (LookThroughBitCast) - V = V->stripPointerCasts(); - const auto *CB = dyn_cast<CallBase>(V); if (!CB) return nullptr; @@ -175,11 +171,9 @@ getAllocationDataForFunction(const Function *Callee, AllocType AllocTy, } static Optional<AllocFnsTy> getAllocationData(const Value *V, AllocType AllocTy, - const TargetLibraryInfo *TLI, - bool LookThroughBitCast = false) { + const TargetLibraryInfo *TLI) { bool IsNoBuiltinCall; - if (const Function *Callee = - getCalledFunction(V, LookThroughBitCast, IsNoBuiltinCall)) + if (const Function *Callee = getCalledFunction(V, IsNoBuiltinCall)) if (!IsNoBuiltinCall) return getAllocationDataForFunction(Callee, AllocTy, TLI); return None; @@ -187,11 +181,9 @@ static Optional<AllocFnsTy> getAllocationData(const Value *V, AllocType AllocTy, static Optional<AllocFnsTy> getAllocationData(const Value *V, AllocType AllocTy, - function_ref<const TargetLibraryInfo &(Function &)> GetTLI, - bool LookThroughBitCast = false) { + function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { bool IsNoBuiltinCall; - if (const Function *Callee = - getCalledFunction(V, LookThroughBitCast, IsNoBuiltinCall)) + if (const Function *Callee = getCalledFunction(V, IsNoBuiltinCall)) if (!IsNoBuiltinCall) return getAllocationDataForFunction( Callee, AllocTy, &GetTLI(const_cast<Function &>(*Callee))); @@ -202,7 +194,7 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V, const TargetLibraryInfo *TLI) { bool IsNoBuiltinCall; const Function *Callee = - getCalledFunction(V, /*LookThroughBitCast=*/false, IsNoBuiltinCall); + getCalledFunction(V, IsNoBuiltinCall); if (!Callee) return None; @@ -226,92 +218,57 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V, Result.NumParams = Callee->getNumOperands(); Result.FstParam = Args.first; Result.SndParam = Args.second.getValueOr(-1); + // Allocsize has no way to specify an alignment argument + Result.AlignParam = -1; return Result; } -static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) { - const auto *CB = - dyn_cast<CallBase>(LookThroughBitCast ? V->stripPointerCasts() : V); - return CB && CB->hasRetAttr(Attribute::NoAlias); -} - /// Tests if a value is a call or invoke to a library function that /// allocates or reallocates memory (either malloc, calloc, realloc, or strdup /// like). -bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast).hasValue(); +bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, AnyAlloc, TLI).hasValue(); } bool llvm::isAllocationFn( - const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI, - bool LookThroughBitCast) { - return getAllocationData(V, AnyAlloc, GetTLI, LookThroughBitCast).hasValue(); -} - -/// Tests if a value is a call or invoke to a function that returns a -/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions). 
-bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - // it's safe to consider realloc as noalias since accessing the original - // pointer is undefined behavior - return isAllocationFn(V, TLI, LookThroughBitCast) || - hasNoAliasAttr(V, LookThroughBitCast); + const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { + return getAllocationData(V, AnyAlloc, GetTLI).hasValue(); } /// Tests if a value is a call or invoke to a library function that /// allocates uninitialized memory (such as malloc). -bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, MallocLike, TLI, LookThroughBitCast).hasValue(); -} -bool llvm::isMallocLikeFn( - const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI, - bool LookThroughBitCast) { - return getAllocationData(V, MallocLike, GetTLI, LookThroughBitCast) - .hasValue(); +static bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, MallocOrOpNewLike, TLI).hasValue(); } /// Tests if a value is a call or invoke to a library function that /// allocates uninitialized memory with alignment (such as aligned_alloc). -bool llvm::isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, AlignedAllocLike, TLI, LookThroughBitCast) - .hasValue(); -} -bool llvm::isAlignedAllocLikeFn( - const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI, - bool LookThroughBitCast) { - return getAllocationData(V, AlignedAllocLike, GetTLI, LookThroughBitCast) +static bool isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, AlignedAllocLike, TLI) .hasValue(); } /// Tests if a value is a call or invoke to a library function that /// allocates zero-filled memory (such as calloc). -bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, CallocLike, TLI, LookThroughBitCast).hasValue(); +static bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, CallocLike, TLI).hasValue(); } /// Tests if a value is a call or invoke to a library function that /// allocates memory similar to malloc or calloc. -bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, MallocOrCallocLike, TLI, - LookThroughBitCast).hasValue(); +bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, MallocOrCallocLike, TLI).hasValue(); } /// Tests if a value is a call or invoke to a library function that /// allocates memory (either malloc, calloc, or strdup like). -bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, AllocLike, TLI, LookThroughBitCast).hasValue(); +bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, AllocLike, TLI).hasValue(); } /// Tests if a value is a call or invoke to a library function that /// reallocates memory (e.g., realloc). 
-bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast).hasValue(); +bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) { + return getAllocationData(V, ReallocLike, TLI).hasValue(); } /// Tests if a functions is a call or invoke to a library function that @@ -320,113 +277,122 @@ bool llvm::isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI) { return getAllocationDataForFunction(F, ReallocLike, TLI).hasValue(); } -/// Tests if a value is a call or invoke to a library function that -/// allocates memory and throws if an allocation failed (e.g., new). -bool llvm::isOpNewLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast).hasValue(); -} +bool llvm::isAllocRemovable(const CallBase *CB, const TargetLibraryInfo *TLI) { + assert(isAllocationFn(CB, TLI)); -/// Tests if a value is a call or invoke to a library function that -/// allocates memory (strdup, strndup). -bool llvm::isStrdupLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, StrDupLike, TLI, LookThroughBitCast).hasValue(); -} + // Note: Removability is highly dependent on the source language. For + // example, recent C++ requires direct calls to the global allocation + // [basic.stc.dynamic.allocation] to be observable unless part of a new + // expression [expr.new paragraph 13]. -/// extractMallocCall - Returns the corresponding CallInst if the instruction -/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we -/// ignore InvokeInst here. -const CallInst *llvm::extractMallocCall( - const Value *I, - function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { - return isMallocLikeFn(I, GetTLI) ? dyn_cast<CallInst>(I) : nullptr; + // Historically we've treated the C family allocation routines as removable + return isAllocLikeFn(CB, TLI); } -static Value *computeArraySize(const CallInst *CI, const DataLayout &DL, - const TargetLibraryInfo *TLI, - bool LookThroughSExt = false) { - if (!CI) - return nullptr; +Value *llvm::getAllocAlignment(const CallBase *V, + const TargetLibraryInfo *TLI) { + assert(isAllocationFn(V, TLI)); - // The size of the malloc's result type must be known to determine array size. - Type *T = getMallocAllocatedType(CI, TLI); - if (!T || !T->isSized()) + const Optional<AllocFnsTy> FnData = getAllocationData(V, AnyAlloc, TLI); + if (!FnData.hasValue() || FnData->AlignParam < 0) { return nullptr; + } + return V->getOperand(FnData->AlignParam); +} - unsigned ElementSize = DL.getTypeAllocSize(T); - if (StructType *ST = dyn_cast<StructType>(T)) - ElementSize = DL.getStructLayout(ST)->getSizeInBytes(); +/// When we're compiling N-bit code, and the user uses parameters that are +/// greater than N bits (e.g. uint64_t on a 32-bit build), we can run into +/// trouble with APInt size issues. This function handles resizing + overflow +/// checks for us. Check and zext or trunc \p I depending on IntTyBits and +/// I's value. +static bool CheckedZextOrTrunc(APInt &I, unsigned IntTyBits) { + // More bits than we can handle. Checking the bit width isn't necessary, but + // it's faster than checking active bits, and should give `false` in the + // vast majority of cases. 
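// For example (illustrative values, not from this diff): with
// IntTyBits == 32, a 64-bit APInt holding 1 << 40 has 41 active bits and is
// rejected, while a 64-bit APInt holding 100 is truncated to 32 bits in
// place and accepted.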
+ if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits) + return false; + if (I.getBitWidth() != IntTyBits) + I = I.zextOrTrunc(IntTyBits); + return true; +} - // If malloc call's arg can be determined to be a multiple of ElementSize, - // return the multiple. Otherwise, return NULL. - Value *MallocArg = CI->getArgOperand(0); - Value *Multiple = nullptr; - if (ComputeMultiple(MallocArg, ElementSize, Multiple, LookThroughSExt)) - return Multiple; +Optional<APInt> +llvm::getAllocSize(const CallBase *CB, + const TargetLibraryInfo *TLI, + std::function<const Value*(const Value*)> Mapper) { + // Note: This handles both explicitly listed allocation functions and + // allocsize. The code structure could stand to be cleaned up a bit. + Optional<AllocFnsTy> FnData = getAllocationSize(CB, TLI); + if (!FnData) + return None; - return nullptr; -} + // Get the index type for this address space, results and intermediate + // computations are performed at that width. + auto &DL = CB->getModule()->getDataLayout(); + const unsigned IntTyBits = DL.getIndexTypeSizeInBits(CB->getType()); + + // Handle strdup-like functions separately. + if (FnData->AllocTy == StrDupLike) { + APInt Size(IntTyBits, GetStringLength(Mapper(CB->getArgOperand(0)))); + if (!Size) + return None; -/// getMallocType - Returns the PointerType resulting from the malloc call. -/// The PointerType depends on the number of bitcast uses of the malloc call: -/// 0: PointerType is the calls' return type. -/// 1: PointerType is the bitcast's result type. -/// >1: Unique PointerType cannot be determined, return NULL. -PointerType *llvm::getMallocType(const CallInst *CI, - const TargetLibraryInfo *TLI) { - assert(isMallocLikeFn(CI, TLI) && "getMallocType and not malloc call"); - - PointerType *MallocType = nullptr; - unsigned NumOfBitCastUses = 0; - - // Determine if CallInst has a bitcast use. - for (const User *U : CI->users()) - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - MallocType = cast<PointerType>(BCI->getDestTy()); - NumOfBitCastUses++; + // Strndup limits strlen. + if (FnData->FstParam > 0) { + const ConstantInt *Arg = + dyn_cast<ConstantInt>(Mapper(CB->getArgOperand(FnData->FstParam))); + if (!Arg) + return None; + + APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits); + if (Size.ugt(MaxSize)) + Size = MaxSize + 1; } + return Size; + } - // Malloc call has 1 bitcast use, so type is the bitcast's destination type. - if (NumOfBitCastUses == 1) - return MallocType; + const ConstantInt *Arg = + dyn_cast<ConstantInt>(Mapper(CB->getArgOperand(FnData->FstParam))); + if (!Arg) + return None; - // Malloc call was not bitcast, so type is the malloc function's return type. - if (NumOfBitCastUses == 0) - return cast<PointerType>(CI->getType()); + APInt Size = Arg->getValue(); + if (!CheckedZextOrTrunc(Size, IntTyBits)) + return None; - // Type could not be determined. - return nullptr; -} + // Size is determined by just 1 parameter. + if (FnData->SndParam < 0) + return Size; -/// getMallocAllocatedType - Returns the Type allocated by malloc call. -/// The Type depends on the number of bitcast uses of the malloc call: -/// 0: PointerType is the malloc calls' return type. -/// 1: PointerType is the bitcast's result type. -/// >1: Unique PointerType cannot be determined, return NULL. -Type *llvm::getMallocAllocatedType(const CallInst *CI, - const TargetLibraryInfo *TLI) { - PointerType *PT = getMallocType(CI, TLI); - return PT ? 
PT->getElementType() : nullptr; -} + Arg = dyn_cast<ConstantInt>(Mapper(CB->getArgOperand(FnData->SndParam))); + if (!Arg) + return None; + + APInt NumElems = Arg->getValue(); + if (!CheckedZextOrTrunc(NumElems, IntTyBits)) + return None; -/// getMallocArraySize - Returns the array size of a malloc call. If the -/// argument passed to malloc is a multiple of the size of the malloced type, -/// then return that multiple. For non-array mallocs, the multiple is -/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be -/// determined. -Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout &DL, - const TargetLibraryInfo *TLI, - bool LookThroughSExt) { - assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call"); - return computeArraySize(CI, DL, TLI, LookThroughSExt); + bool Overflow; + Size = Size.umul_ov(NumElems, Overflow); + if (Overflow) + return None; + return Size; } -/// extractCallocCall - Returns the corresponding CallInst if the instruction -/// is a calloc call. -const CallInst *llvm::extractCallocCall(const Value *I, - const TargetLibraryInfo *TLI) { - return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : nullptr; +Constant *llvm::getInitialValueOfAllocation(const CallBase *Alloc, + const TargetLibraryInfo *TLI, + Type *Ty) { + assert(isAllocationFn(Alloc, TLI)); + + // malloc and aligned_alloc are uninitialized (undef) + if (isMallocLikeFn(Alloc, TLI) || isAlignedAllocLikeFn(Alloc, TLI)) + return UndefValue::get(Ty); + + // calloc zero initializes + if (isCallocLikeFn(Alloc, TLI)) + return Constant::getNullValue(Ty); + + return nullptr; } /// isLibFreeFunction - Returns true if the function is a builtin free() @@ -485,8 +451,7 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) { /// isFreeCall - Returns non-null if the value is a call to the builtin free() const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) { bool IsNoBuiltinCall; - const Function *Callee = - getCalledFunction(I, /*LookThroughBitCast=*/false, IsNoBuiltinCall); + const Function *Callee = getCalledFunction(I, IsNoBuiltinCall); if (Callee == nullptr || IsNoBuiltinCall) return nullptr; @@ -644,20 +609,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { return unknown(); } -/// When we're compiling N-bit code, and the user uses parameters that are -/// greater than N bits (e.g. uint64_t on a 32-bit build), we can run into -/// trouble with APInt size issues. This function handles resizing + overflow -/// checks for us. Check and zext or trunc \p I depending on IntTyBits and -/// I's value. bool ObjectSizeOffsetVisitor::CheckedZextOrTrunc(APInt &I) { - // More bits than we can handle. Checking the bit width isn't necessary, but - // it's faster than checking active bits, and should give `false` in the - // vast majority of cases. - if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits) - return false; - if (I.getBitWidth() != IntTyBits) - I = I.zextOrTrunc(IntTyBits); - return true; + return ::CheckedZextOrTrunc(I, IntTyBits); } SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { @@ -698,61 +651,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) { } SizeOffsetType ObjectSizeOffsetVisitor::visitCallBase(CallBase &CB) { - Optional<AllocFnsTy> FnData = getAllocationSize(&CB, TLI); - if (!FnData) - return unknown(); - - // Handle strdup-like functions separately. 
- if (FnData->AllocTy == StrDupLike) { - APInt Size(IntTyBits, GetStringLength(CB.getArgOperand(0))); - if (!Size) - return unknown(); - - // Strndup limits strlen. - if (FnData->FstParam > 0) { - ConstantInt *Arg = - dyn_cast<ConstantInt>(CB.getArgOperand(FnData->FstParam)); - if (!Arg) - return unknown(); - - APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits); - if (Size.ugt(MaxSize)) - Size = MaxSize + 1; - } - return std::make_pair(Size, Zero); - } - - ConstantInt *Arg = dyn_cast<ConstantInt>(CB.getArgOperand(FnData->FstParam)); - if (!Arg) - return unknown(); - - APInt Size = Arg->getValue(); - if (!CheckedZextOrTrunc(Size)) - return unknown(); - - // Size is determined by just 1 parameter. - if (FnData->SndParam < 0) - return std::make_pair(Size, Zero); - - Arg = dyn_cast<ConstantInt>(CB.getArgOperand(FnData->SndParam)); - if (!Arg) - return unknown(); - - APInt NumElems = Arg->getValue(); - if (!CheckedZextOrTrunc(NumElems)) - return unknown(); - - bool Overflow; - Size = Size.umul_ov(NumElems, Overflow); - return Overflow ? unknown() : std::make_pair(Size, Zero); - - // TODO: handle more standard functions (+ wchar cousins): - // - strdup / strndup - // - strcpy / strncpy - // - strcat / strncat - // - memcpy / memmove - // - strcat / strncat - // - memset + auto Mapper = [](const Value *V) { return V; }; + if (Optional<APInt> Size = getAllocSize(&CB, TLI, Mapper)) + return std::make_pair(*Size, Zero); + return unknown(); } SizeOffsetType @@ -976,7 +878,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallBase(CallBase &CB) { // Handle strdup-like functions separately. if (FnData->AllocTy == StrDupLike) { - // TODO + // TODO: implement evaluation of strdup/strndup return unknown(); } @@ -989,14 +891,6 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallBase(CallBase &CB) { SecondArg = Builder.CreateZExtOrTrunc(SecondArg, IntTy); Value *Size = Builder.CreateMul(FirstArg, SecondArg); return std::make_pair(Size, Zero); - - // TODO: handle more standard functions (+ wchar cousins): - // - strdup / strndup - // - strcpy / strncpy - // - strcat / strncat - // - memcpy / memmove - // - strcat / strncat - // - memset } SizeOffsetEvalType diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index da6bb4c49cba..36df462c7a66 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -594,7 +594,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // turn into undef. Note that we can bypass the allocation itself when // looking for a clobber in many cases; that's an alias property and is // handled by BasicAA. 
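  // The check below moves from the TLI-driven isNoAliasFn() (removed from
  // MemoryBuiltins in this patch) to the attribute-driven isNoAliasCall(),
  // which keys purely off the call's return attribute, e.g. (illustrative):
  //   %p = call noalias i8* @malloc(i64 40)
  // Allocators recognized by TLI typically still qualify because library-call
  // attribute inference marks their returns noalias.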
- if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, &TLI)) { + if (isa<AllocaInst>(Inst) || isNoAliasCall(Inst)) { const Value *AccessPtr = getUnderlyingObject(MemLoc.Ptr); if (AccessPtr == Inst || BatchAA.isMustAlias(Inst, AccessPtr)) return MemDepResult::getDef(Inst); diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index ac20e20f0c0d..57f431ec21f5 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -1265,8 +1265,8 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) { } MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT) - : AA(nullptr), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr), - SkipWalker(nullptr), NextID(0) { + : DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr), + SkipWalker(nullptr) { // Build MemorySSA using a batch alias analysis. This reuses the internal // state that AA collects during an alias()/getModRefInfo() call. This is // safe because there are no CFG changes while building MemorySSA and can diff --git a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp index 941458f648bc..fab51d6a7aaf 100644 --- a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp +++ b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp @@ -22,12 +22,13 @@ ModelUnderTrainingRunner::ModelUnderTrainingRunner( LLVMContext &Ctx, const std::string &ModelPath, const std::vector<TensorSpec> &InputSpecs, const std::vector<LoggedFeatureSpec> &OutputSpecs) - : MLModelRunner(Ctx), OutputSpecs(OutputSpecs) { + : MLModelRunner(Ctx, MLModelRunner::Kind::Development), + OutputSpecs(OutputSpecs) { Evaluator = std::make_unique<TFModelEvaluator>( ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; }, OutputSpecs.size()); if (!Evaluator || !Evaluator->isValid()) { - Ctx.emitError("Failed to create inliner saved model evaluator"); + Ctx.emitError("Failed to create saved model evaluator"); Evaluator.reset(); return; } @@ -46,4 +47,21 @@ void *ModelUnderTrainingRunner::getTensorUntyped(size_t Index) { return Evaluator->getUntypedInput(Index); } +std::unique_ptr<ModelUnderTrainingRunner> +ModelUnderTrainingRunner::createAndEnsureValid( + LLVMContext &Ctx, const std::string &ModelPath, StringRef DecisionName, + const std::vector<TensorSpec> &InputSpecs, + StringRef OutputSpecsPathOverride) { + std::unique_ptr<ModelUnderTrainingRunner> MUTR; + if (auto MaybeOutputSpecs = loadOutputSpecs(Ctx, DecisionName, ModelPath, + OutputSpecsPathOverride)) + MUTR.reset(new ModelUnderTrainingRunner(Ctx, ModelPath, InputSpecs, + *MaybeOutputSpecs)); + if (MUTR && MUTR->isValid()) + return MUTR; + + Ctx.emitError("Could not load the policy model from the provided path"); + return nullptr; +} + #endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/NoInferenceModelRunner.cpp b/llvm/lib/Analysis/NoInferenceModelRunner.cpp index 02ece6aa3900..7178120ebe4f 100644 --- a/llvm/lib/Analysis/NoInferenceModelRunner.cpp +++ b/llvm/lib/Analysis/NoInferenceModelRunner.cpp @@ -20,7 +20,7 @@ using namespace llvm; NoInferenceModelRunner::NoInferenceModelRunner( LLVMContext &Ctx, const std::vector<TensorSpec> &Inputs) - : MLModelRunner(Ctx) { + : MLModelRunner(Ctx, MLModelRunner::Kind::NoOp) { ValuesBuffer.reserve(Inputs.size()); for (const auto &TS : Inputs) ValuesBuffer.push_back(std::make_unique<char[]>(TS.getElementCount() * diff --git a/llvm/lib/Analysis/ObjCARCInstKind.cpp b/llvm/lib/Analysis/ObjCARCInstKind.cpp index f74a9f7f104f..d177ee056a93 100644 --- 
a/llvm/lib/Analysis/ObjCARCInstKind.cpp +++ b/llvm/lib/Analysis/ObjCARCInstKind.cpp @@ -32,8 +32,8 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, return OS << "ARCInstKind::Retain"; case ARCInstKind::RetainRV: return OS << "ARCInstKind::RetainRV"; - case ARCInstKind::ClaimRV: - return OS << "ARCInstKind::ClaimRV"; + case ARCInstKind::UnsafeClaimRV: + return OS << "ARCInstKind::UnsafeClaimRV"; case ARCInstKind::RetainBlock: return OS << "ARCInstKind::RetainBlock"; case ARCInstKind::Release: @@ -127,7 +127,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { case Intrinsic::objc_clang_arc_use: return ARCInstKind::IntrinsicUser; case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: - return ARCInstKind::ClaimRV; + return ARCInstKind::UnsafeClaimRV; case Intrinsic::objc_retainedObject: return ARCInstKind::NoopCast; case Intrinsic::objc_unretainedObject: @@ -334,7 +334,7 @@ bool llvm::objcarc::IsUser(ARCInstKind Class) { case ARCInstKind::StoreStrong: case ARCInstKind::Call: case ARCInstKind::None: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: return false; } llvm_unreachable("covered switch isn't covered?"); @@ -370,7 +370,7 @@ bool llvm::objcarc::IsRetain(ARCInstKind Class) { case ARCInstKind::Call: case ARCInstKind::User: case ARCInstKind::None: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: return false; } llvm_unreachable("covered switch isn't covered?"); @@ -384,7 +384,7 @@ bool llvm::objcarc::IsAutorelease(ARCInstKind Class) { return true; case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::RetainBlock: case ARCInstKind::Release: case ARCInstKind::AutoreleasepoolPush: @@ -416,7 +416,7 @@ bool llvm::objcarc::IsForwarding(ARCInstKind Class) { switch (Class) { case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::Autorelease: case ARCInstKind::AutoreleaseRV: case ARCInstKind::NoopCast: @@ -451,7 +451,7 @@ bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) { switch (Class) { case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::Release: case ARCInstKind::Autorelease: case ARCInstKind::AutoreleaseRV: @@ -486,7 +486,7 @@ bool llvm::objcarc::IsNoopOnGlobal(ARCInstKind Class) { switch (Class) { case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::Release: case ARCInstKind::Autorelease: case ARCInstKind::AutoreleaseRV: @@ -522,7 +522,7 @@ bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) { switch (Class) { case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::AutoreleaseRV: return true; case ARCInstKind::Release: @@ -563,7 +563,7 @@ bool llvm::objcarc::IsNeverTail(ARCInstKind Class) { return true; case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::AutoreleaseRV: case ARCInstKind::Release: case ARCInstKind::RetainBlock: @@ -598,7 +598,7 @@ bool llvm::objcarc::IsNoThrow(ARCInstKind Class) { switch (Class) { case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::Release: case ARCInstKind::Autorelease: case ARCInstKind::AutoreleaseRV: @@ -643,7 +643,7 @@ bool 
llvm::objcarc::CanInterruptRV(ARCInstKind Class) { return true; case ARCInstKind::Retain: case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::Release: case ARCInstKind::AutoreleasepoolPush: case ARCInstKind::RetainBlock: @@ -696,7 +696,7 @@ bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) { case ARCInstKind::StoreStrong: case ARCInstKind::CallOrUser: case ARCInstKind::Call: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: return true; } diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp index 4c80f6743411..02d084937ccb 100644 --- a/llvm/lib/Analysis/PHITransAddr.cpp +++ b/llvm/lib/Analysis/PHITransAddr.cpp @@ -226,7 +226,8 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB, return GEP; // Simplify the GEP to handle 'gep x, 0' -> x etc. - if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(), GEPOps, + if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(), GEPOps[0], + ArrayRef<Value *>(GEPOps).slice(1), GEP->isInBounds(), {DL, TLI, DT, AC})) { for (unsigned i = 0, e = GEPOps.size(); i != e; ++i) RemoveInstInputs(GEPOps[i], InstInputs); diff --git a/llvm/lib/Analysis/RegionPass.cpp b/llvm/lib/Analysis/RegionPass.cpp index c20ecff5f912..10c8569096c6 100644 --- a/llvm/lib/Analysis/RegionPass.cpp +++ b/llvm/lib/Analysis/RegionPass.cpp @@ -30,8 +30,7 @@ using namespace llvm; char RGPassManager::ID = 0; -RGPassManager::RGPassManager() - : FunctionPass(ID), PMDataManager() { +RGPassManager::RGPassManager() : FunctionPass(ID) { RI = nullptr; CurrentRegion = nullptr; } diff --git a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp index f83d8b0fd230..294bc38c17ad 100644 --- a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp +++ b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp @@ -28,8 +28,7 @@ ReplayInlineAdvisor::ReplayInlineAdvisor( std::unique_ptr<InlineAdvisor> OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks) : InlineAdvisor(M, FAM), OriginalAdvisor(std::move(OriginalAdvisor)), - HasReplayRemarks(false), ReplaySettings(ReplaySettings), - EmitRemarks(EmitRemarks) { + ReplaySettings(ReplaySettings), EmitRemarks(EmitRemarks) { auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(ReplaySettings.ReplayFile); std::error_code EC = BufferOrErr.getError(); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 0c3f32295ae1..07aac1523b47 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -301,7 +301,8 @@ void SCEV::print(raw_ostream &OS) const { case scUMaxExpr: case scSMaxExpr: case scUMinExpr: - case scSMinExpr: { + case scSMinExpr: + case scSequentialUMinExpr: { const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(this); const char *OpStr = nullptr; switch (NAry->getSCEVType()) { @@ -315,6 +316,9 @@ void SCEV::print(raw_ostream &OS) const { case scSMinExpr: OpStr = " smin "; break; + case scSequentialUMinExpr: + OpStr = " umin_seq "; + break; default: llvm_unreachable("There are no other nary expression types."); } @@ -392,6 +396,8 @@ Type *SCEV::getType() const { case scUMinExpr: case scSMinExpr: return cast<SCEVMinMaxExpr>(this)->getType(); + case scSequentialUMinExpr: + return cast<SCEVSequentialMinMaxExpr>(this)->getType(); case scAddExpr: return cast<SCEVAddExpr>(this)->getType(); case scUDivExpr: @@ -774,7 +780,8 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV, case scSMaxExpr: case scUMaxExpr: 
case scSMinExpr: - case scUMinExpr: { + case scUMinExpr: + case scSequentialUMinExpr: { const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS); const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS); @@ -2110,6 +2117,22 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { return S; } +const SCEV *ScalarEvolution::getCastExpr(SCEVTypes Kind, const SCEV *Op, + Type *Ty) { + switch (Kind) { + case scTruncate: + return getTruncateExpr(Op, Ty); + case scZeroExtend: + return getZeroExtendExpr(Op, Ty); + case scSignExtend: + return getSignExtendExpr(Op, Ty); + case scPtrToInt: + return getPtrToIntExpr(Op, Ty); + default: + llvm_unreachable("Not a SCEV cast expression!"); + } +} + /// getAnyExtendExpr - Return a SCEV for the given operand extended with /// unspecified bits out to the given type. const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, @@ -3463,7 +3486,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, return S; } -static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) { +const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) { APInt A = C1->getAPInt().abs(); APInt B = C2->getAPInt().abs(); uint32_t ABW = A.getBitWidth(); @@ -3721,6 +3744,7 @@ const SCEV *ScalarEvolution::getAbsExpr(const SCEV *Op, bool IsNSW) { const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind, SmallVectorImpl<const SCEV *> &Ops) { + assert(SCEVMinMaxExpr::isMinMaxType(Kind) && "Not a SCEVMinMaxExpr!"); assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG @@ -3857,6 +3881,209 @@ const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind, return S; } +namespace { + +class SCEVSequentialMinMaxDeduplicatingVisitor final + : public SCEVVisitor<SCEVSequentialMinMaxDeduplicatingVisitor, + Optional<const SCEV *>> { + using RetVal = Optional<const SCEV *>; + using Base = SCEVVisitor<SCEVSequentialMinMaxDeduplicatingVisitor, RetVal>; + + ScalarEvolution &SE; + const SCEVTypes RootKind; // Must be a sequential min/max expression. + const SCEVTypes NonSequentialRootKind; // Non-sequential variant of RootKind. + SmallPtrSet<const SCEV *, 16> SeenOps; + + bool canRecurseInto(SCEVTypes Kind) const { + // We can only recurse into the SCEV expression of the same effective type + // as the type of our root SCEV expression. + return RootKind == Kind || NonSequentialRootKind == Kind; + }; + + RetVal visitAnyMinMaxExpr(const SCEV *S) { + assert((isa<SCEVMinMaxExpr>(S) || isa<SCEVSequentialMinMaxExpr>(S)) && + "Only for min/max expressions."); + SCEVTypes Kind = S->getSCEVType(); + + if (!canRecurseInto(Kind)) + return S; + + auto *NAry = cast<SCEVNAryExpr>(S); + SmallVector<const SCEV *> NewOps; + bool Changed = + visit(Kind, makeArrayRef(NAry->op_begin(), NAry->op_end()), NewOps); + + if (!Changed) + return S; + if (NewOps.empty()) + return None; + + return isa<SCEVSequentialMinMaxExpr>(S) + ? SE.getSequentialMinMaxExpr(Kind, NewOps) + : SE.getMinMaxExpr(Kind, NewOps); + } + + RetVal visit(const SCEV *S) { + // Has the whole operand been seen already? 
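An aside on the deduplicating visitor above: min/max expressions are idempotent, so a repeated operand contributes nothing, but because the sequential form is evaluated left to right only the *first* occurrence may be kept and operand order must survive. The SeenOps check that resumes right after this note is what implements that policy. A minimal order-preserving sketch with plain containers, all names hypothetical:

    #include <unordered_set>
    #include <vector>

    // Keep only the first occurrence of each operand, preserving the
    // original order -- the same policy the visitor applies via
    // SeenOps.insert(S).second.
    static void dedupKeepFirst(std::vector<int> &Ops) {
      std::unordered_set<int> Seen;
      std::vector<int> Out;
      Out.reserve(Ops.size());
      for (int Op : Ops)
        if (Seen.insert(Op).second) // false means "seen already" -> drop
          Out.push_back(Op);
      Ops = std::move(Out);
    }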
+ if (!SeenOps.insert(S).second) + return None; + return Base::visit(S); + } + +public: + SCEVSequentialMinMaxDeduplicatingVisitor(ScalarEvolution &SE, + SCEVTypes RootKind) + : SE(SE), RootKind(RootKind), + NonSequentialRootKind( + SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType( + RootKind)) {} + + bool /*Changed*/ visit(SCEVTypes Kind, ArrayRef<const SCEV *> OrigOps, + SmallVectorImpl<const SCEV *> &NewOps) { + bool Changed = false; + SmallVector<const SCEV *> Ops; + Ops.reserve(OrigOps.size()); + + for (const SCEV *Op : OrigOps) { + RetVal NewOp = visit(Op); + if (NewOp != Op) + Changed = true; + if (NewOp) + Ops.emplace_back(*NewOp); + } + + if (Changed) + NewOps = std::move(Ops); + return Changed; + } + + RetVal visitConstant(const SCEVConstant *Constant) { return Constant; } + + RetVal visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { return Expr; } + + RetVal visitTruncateExpr(const SCEVTruncateExpr *Expr) { return Expr; } + + RetVal visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { return Expr; } + + RetVal visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { return Expr; } + + RetVal visitAddExpr(const SCEVAddExpr *Expr) { return Expr; } + + RetVal visitMulExpr(const SCEVMulExpr *Expr) { return Expr; } + + RetVal visitUDivExpr(const SCEVUDivExpr *Expr) { return Expr; } + + RetVal visitAddRecExpr(const SCEVAddRecExpr *Expr) { return Expr; } + + RetVal visitSMaxExpr(const SCEVSMaxExpr *Expr) { + return visitAnyMinMaxExpr(Expr); + } + + RetVal visitUMaxExpr(const SCEVUMaxExpr *Expr) { + return visitAnyMinMaxExpr(Expr); + } + + RetVal visitSMinExpr(const SCEVSMinExpr *Expr) { + return visitAnyMinMaxExpr(Expr); + } + + RetVal visitUMinExpr(const SCEVUMinExpr *Expr) { + return visitAnyMinMaxExpr(Expr); + } + + RetVal visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) { + return visitAnyMinMaxExpr(Expr); + } + + RetVal visitUnknown(const SCEVUnknown *Expr) { return Expr; } + + RetVal visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { return Expr; } +}; + +} // namespace + +const SCEV * +ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind, + SmallVectorImpl<const SCEV *> &Ops) { + assert(SCEVSequentialMinMaxExpr::isSequentialMinMaxType(Kind) && + "Not a SCEVSequentialMinMaxExpr!"); + assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!"); + if (Ops.size() == 1) + return Ops[0]; + if (Ops.size() == 2 && + any_of(Ops, [](const SCEV *Op) { return isa<SCEVConstant>(Op); })) + return getMinMaxExpr( + SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType(Kind), + Ops); +#ifndef NDEBUG + Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); + for (unsigned i = 1, e = Ops.size(); i != e; ++i) { + assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && + "Operand types don't match!"); + assert(Ops[0]->getType()->isPointerTy() == + Ops[i]->getType()->isPointerTy() && + "min/max should be consistently pointerish"); + } +#endif + + // Note that SCEVSequentialMinMaxExpr is *NOT* commutative, + // so we can *NOT* do any kind of sorting of the expressions! + + // Check if we have created the same expression before. + if (const SCEV *S = findExistingSCEVInCache(Kind, Ops)) + return S; + + // FIXME: there are *some* simplifications that we can do here. + + // Keep only the first instance of an operand. + { + SCEVSequentialMinMaxDeduplicatingVisitor Deduplicator(*this, Kind); + bool Changed = Deduplicator.visit(Kind, Ops, Ops); + if (Changed) + return getSequentialMinMaxExpr(Kind, Ops); + } + + // Check to see if one of the operands is of the same kind. 
If so, expand its + // operands onto our operand list, and recurse to simplify. + { + unsigned Idx = 0; + bool DeletedAny = false; + while (Idx < Ops.size()) { + if (Ops[Idx]->getSCEVType() != Kind) { + ++Idx; + continue; + } + const auto *SMME = cast<SCEVSequentialMinMaxExpr>(Ops[Idx]); + Ops.erase(Ops.begin() + Idx); + Ops.insert(Ops.begin() + Idx, SMME->op_begin(), SMME->op_end()); + DeletedAny = true; + } + + if (DeletedAny) + return getSequentialMinMaxExpr(Kind, Ops); + } + + // Okay, it looks like we really DO need an expr. Check to see if we + // already have one, otherwise create a new one. + FoldingSetNodeID ID; + ID.AddInteger(Kind); + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + ID.AddPointer(Ops[i]); + void *IP = nullptr; + const SCEV *ExistingSCEV = UniqueSCEVs.FindNodeOrInsertPos(ID, IP); + if (ExistingSCEV) + return ExistingSCEV; + + const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size()); + std::uninitialized_copy(Ops.begin(), Ops.end(), O); + SCEV *S = new (SCEVAllocator) + SCEVSequentialMinMaxExpr(ID.Intern(SCEVAllocator), Kind, O, Ops.size()); + + UniqueSCEVs.InsertNode(S, IP); + registerUser(S, Ops); + return S; +} + const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS, const SCEV *RHS) { SmallVector<const SCEV *, 2> Ops = {LHS, RHS}; return getSMaxExpr(Ops); @@ -3885,14 +4112,16 @@ const SCEV *ScalarEvolution::getSMinExpr(SmallVectorImpl<const SCEV *> &Ops) { return getMinMaxExpr(scSMinExpr, Ops); } -const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS, - const SCEV *RHS) { +const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS, const SCEV *RHS, + bool Sequential) { SmallVector<const SCEV *, 2> Ops = { LHS, RHS }; - return getUMinExpr(Ops); + return getUMinExpr(Ops, Sequential); } -const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops) { - return getMinMaxExpr(scUMinExpr, Ops); +const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops, + bool Sequential) { + return Sequential ? getSequentialMinMaxExpr(scSequentialUMinExpr, Ops) + : getMinMaxExpr(scUMinExpr, Ops); } const SCEV * @@ -4375,13 +4604,15 @@ const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS, } const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS, - const SCEV *RHS) { + const SCEV *RHS, + bool Sequential) { SmallVector<const SCEV *, 2> Ops = { LHS, RHS }; - return getUMinFromMismatchedTypes(Ops); + return getUMinFromMismatchedTypes(Ops, Sequential); } -const SCEV *ScalarEvolution::getUMinFromMismatchedTypes( - SmallVectorImpl<const SCEV *> &Ops) { +const SCEV * +ScalarEvolution::getUMinFromMismatchedTypes(SmallVectorImpl<const SCEV *> &Ops, + bool Sequential) { assert(!Ops.empty() && "At least one operand must be!"); // Trivial case. if (Ops.size() == 1) @@ -4402,7 +4633,7 @@ const SCEV *ScalarEvolution::getUMinFromMismatchedTypes( PromotedOps.push_back(getNoopOrZeroExtend(S, MaxType)); // Generate umin. - return getUMinExpr(PromotedOps); + return getUMinExpr(PromotedOps, Sequential); } const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) { @@ -5513,6 +5744,7 @@ static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S, case scSMaxExpr: case scUMinExpr: case scSMinExpr: + case scSequentialUMinExpr: // These expressions are available if their operand(s) is/are. 
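Before the availability switch resumes below: the scSequentialUMinExpr kind constructed above is what the printer renders as "umin_seq". As far as I can tell from these hunks, the sequential form exists so that poison in a later operand cannot leak once an earlier operand already decides the result; for two unsigned operands that is commonly described as x == 0 ? 0 : umin(x, y). A scalar model of that reading (my own restatement, not code from this patch):

    #include <algorithm>
    #include <cassert>

    // Two-operand model of a sequential unsigned min: the second operand
    // is only consulted when the first cannot already decide the result.
    static unsigned uminSeq(unsigned X, unsigned Y) {
      return X == 0 ? 0 : std::min(X, Y);
    }

    int main() {
      assert(uminSeq(0, 7) == 0); // Y is irrelevant once X == 0
      assert(uminSeq(3, 7) == 3);
      assert(uminSeq(7, 3) == 3);
    }

This also explains why the builder above refuses to sort operands: unlike plain umin, umin_seq is not commutative.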
return true; @@ -6060,35 +6292,31 @@ ScalarEvolution::getRangeRef(const SCEV *S, ConservativeResult.intersectWith(X, RangeType)); } - if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) { - ConstantRange X = getRangeRef(SMax->getOperand(0), SignHint); - for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i) - X = X.smax(getRangeRef(SMax->getOperand(i), SignHint)); - return setRange(SMax, SignHint, - ConservativeResult.intersectWith(X, RangeType)); - } - - if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) { - ConstantRange X = getRangeRef(UMax->getOperand(0), SignHint); - for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i) - X = X.umax(getRangeRef(UMax->getOperand(i), SignHint)); - return setRange(UMax, SignHint, - ConservativeResult.intersectWith(X, RangeType)); - } - - if (const SCEVSMinExpr *SMin = dyn_cast<SCEVSMinExpr>(S)) { - ConstantRange X = getRangeRef(SMin->getOperand(0), SignHint); - for (unsigned i = 1, e = SMin->getNumOperands(); i != e; ++i) - X = X.smin(getRangeRef(SMin->getOperand(i), SignHint)); - return setRange(SMin, SignHint, - ConservativeResult.intersectWith(X, RangeType)); - } + if (isa<SCEVMinMaxExpr>(S) || isa<SCEVSequentialMinMaxExpr>(S)) { + Intrinsic::ID ID; + switch (S->getSCEVType()) { + case scUMaxExpr: + ID = Intrinsic::umax; + break; + case scSMaxExpr: + ID = Intrinsic::smax; + break; + case scUMinExpr: + case scSequentialUMinExpr: + ID = Intrinsic::umin; + break; + case scSMinExpr: + ID = Intrinsic::smin; + break; + default: + llvm_unreachable("Unknown SCEVMinMaxExpr/SCEVSequentialMinMaxExpr."); + } - if (const SCEVUMinExpr *UMin = dyn_cast<SCEVUMinExpr>(S)) { - ConstantRange X = getRangeRef(UMin->getOperand(0), SignHint); - for (unsigned i = 1, e = UMin->getNumOperands(); i != e; ++i) - X = X.umin(getRangeRef(UMin->getOperand(i), SignHint)); - return setRange(UMin, SignHint, + const auto *NAry = cast<SCEVNAryExpr>(S); + ConstantRange X = getRangeRef(NAry->getOperand(0), SignHint); + for (unsigned i = 1, e = NAry->getNumOperands(); i != e; ++i) + X = X.intrinsic(ID, {X, getRangeRef(NAry->getOperand(i), SignHint)}); + return setRange(S, SignHint, ConservativeResult.intersectWith(X, RangeType)); } @@ -7368,11 +7596,6 @@ const SCEV *ScalarEvolution::getConstantMaxTripCountFromArray(const Loop *L) { auto *ArrSize = dyn_cast<ConstantInt>(AllocateInst->getArraySize()); if (!Ty || !ArrSize || !ArrSize->isOne()) continue; - // Also make sure step was increased the same with sizeof allocated - // element type. - const PointerType *GEPT = dyn_cast<PointerType>(GEP->getType()); - if (Ty->getElementType() != GEPT->getElementType()) - continue; // FIXME: Since gep indices are silently zext to the indexing type, // we will have a narrow gep index which wraps around rather than @@ -8093,6 +8316,29 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl( return getZero(CI->getType()); } + // If we're exiting based on the overflow flag of an x.with.overflow intrinsic + // with a constant step, we can form an equivalent icmp predicate and figure + // out how many iterations will be taken before we exit. 
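To make the transformation that follows concrete: for an unsigned add with a constant step, the overflow flag of uadd.with.overflow(x, C) is set exactly when x lies outside the no-wrap region, i.e. when x > UMAX - C, so an overflow-based exit can be rewritten as an ordinary icmp and handed to the usual exit-count machinery. A brute-force check of that equivalence on 8-bit values, purely illustrative:

    #include <cassert>
    #include <cstdint>

    // For i8: uadd.with.overflow(x, C) overflows iff x > 255 - C, which
    // is the icmp the exit-limit code can reason about.
    static bool uaddOverflows(uint8_t X, uint8_t C) {
      return static_cast<uint8_t>(X + C) < X; // result wrapped around
    }

    int main() {
      const uint8_t C = 10;
      for (unsigned X = 0; X <= 255; ++X) {
        bool ViaIcmp = X > 255u - C;
        assert(uaddOverflows(static_cast<uint8_t>(X), C) == ViaIcmp);
      }
    }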
+ const WithOverflowInst *WO; + const APInt *C; + if (match(ExitCond, m_ExtractValue<1>(m_WithOverflowInst(WO))) && + match(WO->getRHS(), m_APInt(C))) { + ConstantRange NWR = + ConstantRange::makeExactNoWrapRegion(WO->getBinaryOp(), *C, + WO->getNoWrapKind()); + CmpInst::Predicate Pred; + APInt NewRHSC, Offset; + NWR.getEquivalentICmp(Pred, NewRHSC, Offset); + if (!ExitIfTrue) + Pred = ICmpInst::getInversePredicate(Pred); + auto *LHS = getSCEV(WO->getLHS()); + if (Offset != 0) + LHS = getAddExpr(LHS, getConstant(Offset)); + auto EL = computeExitLimitFromICmp(L, Pred, LHS, getConstant(NewRHSC), + ControlsExit, AllowPredicates); + if (EL.hasAnyInfo()) return EL; + } + // If it's not an integer or pointer comparison then compute it the hard way. return computeExitCountExhaustively(L, ExitCond, ExitIfTrue); } @@ -8134,26 +8380,11 @@ ScalarEvolution::computeExitLimitFromCondFromBinOp( if (EitherMayExit) { // Both conditions must be same for the loop to continue executing. // Choose the less conservative count. - // If ExitCond is a short-circuit form (select), using - // umin(EL0.ExactNotTaken, EL1.ExactNotTaken) is unsafe in general. - // To see the detailed examples, please see - // test/Analysis/ScalarEvolution/exit-count-select.ll - bool PoisonSafe = isa<BinaryOperator>(ExitCond); - if (!PoisonSafe) - // Even if ExitCond is select, we can safely derive BECount using both - // EL0 and EL1 in these cases: - // (1) EL0.ExactNotTaken is non-zero - // (2) EL1.ExactNotTaken is non-poison - // (3) EL0.ExactNotTaken is zero (BECount should be simply zero and - // it cannot be umin(0, ..)) - // The PoisonSafe assignment below is simplified and the assertion after - // BECount calculation fully guarantees the condition (3). - PoisonSafe = isa<SCEVConstant>(EL0.ExactNotTaken) || - isa<SCEVConstant>(EL1.ExactNotTaken); if (EL0.ExactNotTaken != getCouldNotCompute() && - EL1.ExactNotTaken != getCouldNotCompute() && PoisonSafe) { - BECount = - getUMinFromMismatchedTypes(EL0.ExactNotTaken, EL1.ExactNotTaken); + EL1.ExactNotTaken != getCouldNotCompute()) { + BECount = getUMinFromMismatchedTypes( + EL0.ExactNotTaken, EL1.ExactNotTaken, + /*Sequential=*/!isa<BinaryOperator>(ExitCond)); // If EL0.ExactNotTaken was zero and ExitCond was a short-circuit form, // it should have been simplified to zero (see the condition (3) above) @@ -8203,6 +8434,26 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L, const SCEV *LHS = getSCEV(ExitCond->getOperand(0)); const SCEV *RHS = getSCEV(ExitCond->getOperand(1)); + ExitLimit EL = computeExitLimitFromICmp(L, Pred, LHS, RHS, ControlsExit, + AllowPredicates); + if (EL.hasAnyInfo()) return EL; + + auto *ExhaustiveCount = + computeExitCountExhaustively(L, ExitCond, ExitIfTrue); + + if (!isa<SCEVCouldNotCompute>(ExhaustiveCount)) + return ExhaustiveCount; + + return computeShiftCompareExitLimit(ExitCond->getOperand(0), + ExitCond->getOperand(1), L, OriginalPred); +} +ScalarEvolution::ExitLimit +ScalarEvolution::computeExitLimitFromICmp(const Loop *L, + ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS, + bool ControlsExit, + bool AllowPredicates) { + // Try to evaluate any dependencies out of the loop. 
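The computeExitLimitFromCondFromBinOp change in this hunk is where the new sequential umin pays off: for a short-circuit (select-form) exit condition, plain umin(EL0, EL1) is unsafe because the second arm's count may be poison when the first arm already exits, which is why the old code carried the PoisonSafe special-casing. Passing Sequential=!isa<BinaryOperator>(ExitCond) lets umin_seq absorb that case. A toy model using optional to stand in for a possibly-poison count (my own illustration):

    #include <algorithm>
    #include <cassert>
    #include <optional>

    // nullopt models a trip count that is poison/unknown.
    using Count = std::optional<unsigned>;

    // Sequential umin: if the first count is 0 the loop exits at once and
    // the second arm is never reached, so its poison must not leak.
    static Count uminSeq(Count A, Count B) {
      if (!A)
        return std::nullopt;
      if (*A == 0)
        return Count(0u);
      return B ? Count(std::min(*A, *B)) : Count(std::nullopt);
    }

    int main() {
      assert(uminSeq(0u, std::nullopt) == Count(0u)); // plain umin: poison
      assert(uminSeq(5u, 3u) == Count(3u));
    }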
LHS = getSCEVAtScope(LHS, L); RHS = getSCEVAtScope(RHS, L); @@ -8312,14 +8563,7 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L, break; } - auto *ExhaustiveCount = - computeExitCountExhaustively(L, ExitCond, ExitIfTrue); - - if (!isa<SCEVCouldNotCompute>(ExhaustiveCount)) - return ExhaustiveCount; - - return computeShiftCompareExitLimit(ExitCond->getOperand(0), - ExitCond->getOperand(1), L, OriginalPred); + return getCouldNotCompute(); } ScalarEvolution::ExitLimit @@ -8941,7 +9185,8 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) { case scUMaxExpr: case scSMinExpr: case scUMinExpr: - return nullptr; // TODO: smax, umax, smin, umax. + case scSequentialUMinExpr: + return nullptr; // TODO: smax, umax, smin, umax, umin_seq. } llvm_unreachable("Unknown SCEV kind!"); } @@ -9070,7 +9315,8 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { return V; } - if (const SCEVCommutativeExpr *Comm = dyn_cast<SCEVCommutativeExpr>(V)) { + if (isa<SCEVCommutativeExpr>(V) || isa<SCEVSequentialMinMaxExpr>(V)) { + const auto *Comm = cast<SCEVNAryExpr>(V); // Avoid performing the look-up in the common case where the specified // expression has no loop-variant portions. for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) { @@ -9092,7 +9338,9 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { return getMulExpr(NewOps, Comm->getNoWrapFlags()); if (isa<SCEVMinMaxExpr>(Comm)) return getMinMaxExpr(Comm->getSCEVType(), NewOps); - llvm_unreachable("Unknown commutative SCEV type!"); + if (isa<SCEVSequentialMinMaxExpr>(Comm)) + return getSequentialMinMaxExpr(Comm->getSCEVType(), NewOps); + llvm_unreachable("Unknown commutative / sequential min/max SCEV type!"); } } // If we got here, all operands are loop invariant. @@ -9153,32 +9401,11 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { return AddRec; } - if (const SCEVZeroExtendExpr *Cast = dyn_cast<SCEVZeroExtendExpr>(V)) { - const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); - if (Op == Cast->getOperand()) - return Cast; // must be loop invariant - return getZeroExtendExpr(Op, Cast->getType()); - } - - if (const SCEVSignExtendExpr *Cast = dyn_cast<SCEVSignExtendExpr>(V)) { - const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); - if (Op == Cast->getOperand()) - return Cast; // must be loop invariant - return getSignExtendExpr(Op, Cast->getType()); - } - - if (const SCEVTruncateExpr *Cast = dyn_cast<SCEVTruncateExpr>(V)) { + if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) { const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); if (Op == Cast->getOperand()) return Cast; // must be loop invariant - return getTruncateExpr(Op, Cast->getType()); - } - - if (const SCEVPtrToIntExpr *Cast = dyn_cast<SCEVPtrToIntExpr>(V)) { - const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L); - if (Op == Cast->getOperand()) - return Cast; // must be loop invariant - return getPtrToIntExpr(Op, Cast->getType()); + return getCastExpr(Cast->getSCEVType(), Op, Cast->getType()); } llvm_unreachable("Unknown SCEV type!"); @@ -11236,6 +11463,48 @@ bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred, return true; } +bool ScalarEvolution::isImpliedCondOperandsViaShift(ICmpInst::Predicate Pred, + const SCEV *LHS, + const SCEV *RHS, + const SCEV *FoundLHS, + const SCEV *FoundRHS) { + // We want to imply LHS < RHS from LHS < (RHS >> shiftvalue). First, make + // sure that we are dealing with same LHS. 
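The helper beginning above rests on a simple monotonicity fact: an unsigned lshr can only shrink its operand, so from LHS <u (Shiftee >> V) together with Shiftee <=u RHS the chain LHS <u (Shiftee >> V) <=u Shiftee <=u RHS gives LHS <u RHS (the signed variants additionally require Shiftee to be non-negative). A brute-force check of the unsigned direction on 8-bit values, purely illustrative:

    #include <cassert>
    #include <cstdint>

    int main() {
      // (S >> V) <= S for unsigned S, hence
      // LHS < (S >> V) && S <= RHS  implies  LHS < RHS.
      for (unsigned S = 0; S <= 255; ++S)
        for (unsigned V = 0; V < 8; ++V) {
          uint8_t Shifted = static_cast<uint8_t>(S) >> V;
          assert(Shifted <= S);
        }
    }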
+ if (RHS == FoundRHS) { + std::swap(LHS, RHS); + std::swap(FoundLHS, FoundRHS); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + if (LHS != FoundLHS) + return false; + + auto *SUFoundRHS = dyn_cast<SCEVUnknown>(FoundRHS); + if (!SUFoundRHS) + return false; + + Value *Shiftee, *ShiftValue; + + using namespace PatternMatch; + if (match(SUFoundRHS->getValue(), + m_LShr(m_Value(Shiftee), m_Value(ShiftValue)))) { + auto *ShifteeS = getSCEV(Shiftee); + // Prove one of the following: + // LHS <u (shiftee >> shiftvalue) && shiftee <=u RHS ---> LHS <u RHS + // LHS <=u (shiftee >> shiftvalue) && shiftee <=u RHS ---> LHS <=u RHS + // LHS <s (shiftee >> shiftvalue) && shiftee <=s RHS && shiftee >=s 0 + // ---> LHS <s RHS + // LHS <=s (shiftee >> shiftvalue) && shiftee <=s RHS && shiftee >=s 0 + // ---> LHS <=s RHS + if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) + return isKnownPredicate(ICmpInst::ICMP_ULE, ShifteeS, RHS); + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) + if (isKnownNonNegative(ShifteeS)) + return isKnownPredicate(ICmpInst::ICMP_SLE, ShifteeS, RHS); + } + + return false; +} + bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, @@ -11247,6 +11516,9 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; + if (isImpliedCondOperandsViaShift(Pred, LHS, RHS, FoundLHS, FoundRHS)) + return true; + if (isImpliedCondOperandsViaAddRecStart(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI)) return true; @@ -11323,6 +11595,7 @@ static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE, case ICmpInst::ICMP_ULE: return // min(A, ...) <= A + // FIXME: what about umin_seq? IsMinMaxConsistingOf<SCEVUMinExpr>(LHS, RHS) || // A <= max(A, ...) 
IsMinMaxConsistingOf<SCEVUMaxExpr>(RHS, LHS); @@ -12723,7 +12996,8 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { case scUMaxExpr: case scSMaxExpr: case scUMinExpr: - case scSMinExpr: { + case scSMinExpr: + case scSequentialUMinExpr: { bool HasVarying = false; for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) { LoopDisposition D = getLoopDisposition(Op, L); @@ -12813,7 +13087,8 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { case scUMaxExpr: case scSMaxExpr: case scUMinExpr: - case scSMinExpr: { + case scSMinExpr: + case scSequentialUMinExpr: { const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); bool Proper = true; for (const SCEV *NAryOp : NAry->operands()) { diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp index 3d10479c4544..26bc63983b4e 100644 --- a/llvm/lib/Analysis/TFUtils.cpp +++ b/llvm/lib/Analysis/TFUtils.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/Utils/TFUtils.h" +#include "llvm/Support/Base64.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/JSON.h" @@ -22,6 +23,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" +#include "google/protobuf/struct.pb.h" #include "google/protobuf/text_format.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_experimental.h" @@ -72,6 +74,14 @@ TFStatusPtr createTFStatus() { TFSessionOptionsPtr createTFSessionOptions() { return TFSessionOptionsPtr(TF_NewSessionOptions(), &TF_DeleteSessionOptions); } + +void serialize(const Message &SE, std::string *OutStr) { + if (ProtobufTextMode) { + TextFormat::PrintToString(SE, OutStr); + } else { + *OutStr = SE.SerializeAsString(); + } +} } // namespace namespace llvm { @@ -307,19 +317,13 @@ public: IncludeReward(IncludeReward), FeatureLists(LoggedFeatureSpecs.size()) {} // flush the logged info to a stream and clear the log contents. - void flush(raw_ostream &OS) { + void flush(std::string *Str) { size_t NrRecords = getNrRecords(); (void)NrRecords; tensorflow::SequenceExample SE; transferLog(SE); assert(isSelfConsistent(SE, NrRecords)); - std::string OutStr; - if (ProtobufTextMode) - google::protobuf::TextFormat::PrintToString(SE, &OutStr); - else - OutStr = SE.SerializeAsString(); - - OS << OutStr; + serialize(SE, Str); } char *addNewTensor(size_t FeatureID) { @@ -567,5 +571,31 @@ char *Logger::addEntryAndGetFloatOrInt64Buffer(size_t FeatureID) { return reinterpret_cast<char *>(LoggerData->addNewTensor(FeatureID)); } -void Logger::flush(raw_ostream &OS) { LoggerData->flush(OS); } +void Logger::flush(std::string *Str) { LoggerData->flush(Str); } + +void Logger::flush(raw_ostream &OS) { + std::string Buff; + LoggerData->flush(&Buff); + OS << Buff; +} + +void Logger::flushLogs(raw_ostream &OS, + const StringMap<std::unique_ptr<Logger>> &Loggers) { + google::protobuf::Struct Msg; + for (const auto &NamedLogger : Loggers) { + tensorflow::SequenceExample SE; + const auto &Logger = NamedLogger.second; + std::string Unencoded; + if (Logger->LoggerData->getNrRecords() > 0) + Logger->flush(&Unencoded); + + (*Msg.mutable_fields())[NamedLogger.first().str()] + .mutable_string_value() + ->append(ProtobufTextMode ? 
Unencoded : encodeBase64(Unencoded)); + } + + std::string OutStr; + serialize(Msg, &OutStr); + OS << OutStr; +} #endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 6aa9a77391dc..25e9dee98e13 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -408,6 +408,16 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType, return TTIImpl->isLegalMaskedScatter(DataType, Alignment); } +bool TargetTransformInfo::forceScalarizeMaskedGather(VectorType *DataType, + Align Alignment) const { + return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment); +} + +bool TargetTransformInfo::forceScalarizeMaskedScatter(VectorType *DataType, + Align Alignment) const { + return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment); +} + bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const { return TTIImpl->isLegalMaskedCompressStore(DataType); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index fc378f97de0b..34358739f9a8 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -396,10 +396,10 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL, V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo)); } -unsigned llvm::ComputeMinSignedBits(const Value *V, const DataLayout &DL, - unsigned Depth, AssumptionCache *AC, - const Instruction *CxtI, - const DominatorTree *DT) { +unsigned llvm::ComputeMaxSignificantBits(const Value *V, const DataLayout &DL, + unsigned Depth, AssumptionCache *AC, + const Instruction *CxtI, + const DominatorTree *DT) { unsigned SignBits = ComputeNumSignBits(V, DL, Depth, AC, CxtI, DT); return V->getType()->getScalarSizeInBits() - SignBits + 1; } @@ -1593,7 +1593,7 @@ static void computeKnownBitsFromOperator(const Operator *I, computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleLZ = Known2.countMaxLeadingZeros(); - // If this call is undefined for 0, the result will be less than 2^n. + // If this call is poison for 0 input, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) PossibleLZ = std::min(PossibleLZ, BitWidth - 1); unsigned LowBits = Log2_32(PossibleLZ)+1; @@ -1604,7 +1604,7 @@ static void computeKnownBitsFromOperator(const Operator *I, computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleTZ = Known2.countMaxTrailingZeros(); - // If this call is undefined for 0, the result will be less than 2^n. + // If this call is poison for 0 input, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) PossibleTZ = std::min(PossibleTZ, BitWidth - 1); unsigned LowBits = Log2_32(PossibleTZ)+1; @@ -3248,125 +3248,6 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, return std::max(FirstAnswer, Known.countMinSignBits()); } -/// This function computes the integer multiple of Base that equals V. -/// If successful, it returns true and returns the multiple in -/// Multiple. If unsuccessful, it returns false. It looks -/// through SExt instructions only if LookThroughSExt is true. 
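Stepping back to the ValueTracking rename earlier in this hunk: ComputeMaxSignificantBits returns bitwidth - signbits + 1, i.e. the smallest number of bits that can represent the value in two's complement, which is a clearer name than the old ComputeMinSignedBits. For example, i8 -2 is 0b11111110 with seven redundant leading sign bits, so 8 - 7 + 1 = 2 significant bits. A self-contained sketch of that arithmetic (my own restatement of the formula in the hunk):

    #include <cassert>
    #include <cstdint>

    // Leading sign bits of an 8-bit value, counting the sign bit itself
    // (at least 1, at most 8).
    static unsigned numSignBits(int8_t V) {
      unsigned N = 1;
      for (int Bit = 6; Bit >= 0; --Bit, ++N)
        if (((V >> Bit) & 1) != ((V >> 7) & 1))
          return N;
      return 8;
    }

    static unsigned maxSignificantBits(int8_t V) {
      return 8 - numSignBits(V) + 1;
    }

    int main() {
      assert(maxSignificantBits(-2) == 2);  // 0b11111110
      assert(maxSignificantBits(0) == 1);   // 0b00000000
      assert(maxSignificantBits(127) == 8); // 0b01111111
    }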
-bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, - bool LookThroughSExt, unsigned Depth) { - assert(V && "No Value?"); - assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth"); - assert(V->getType()->isIntegerTy() && "Not integer or pointer type!"); - - Type *T = V->getType(); - - ConstantInt *CI = dyn_cast<ConstantInt>(V); - - if (Base == 0) - return false; - - if (Base == 1) { - Multiple = V; - return true; - } - - ConstantExpr *CO = dyn_cast<ConstantExpr>(V); - Constant *BaseVal = ConstantInt::get(T, Base); - if (CO && CO == BaseVal) { - // Multiple is 1. - Multiple = ConstantInt::get(T, 1); - return true; - } - - if (CI && CI->getZExtValue() % Base == 0) { - Multiple = ConstantInt::get(T, CI->getZExtValue() / Base); - return true; - } - - if (Depth == MaxAnalysisRecursionDepth) return false; - - Operator *I = dyn_cast<Operator>(V); - if (!I) return false; - - switch (I->getOpcode()) { - default: break; - case Instruction::SExt: - if (!LookThroughSExt) return false; - // otherwise fall through to ZExt - LLVM_FALLTHROUGH; - case Instruction::ZExt: - return ComputeMultiple(I->getOperand(0), Base, Multiple, - LookThroughSExt, Depth+1); - case Instruction::Shl: - case Instruction::Mul: { - Value *Op0 = I->getOperand(0); - Value *Op1 = I->getOperand(1); - - if (I->getOpcode() == Instruction::Shl) { - ConstantInt *Op1CI = dyn_cast<ConstantInt>(Op1); - if (!Op1CI) return false; - // Turn Op0 << Op1 into Op0 * 2^Op1 - APInt Op1Int = Op1CI->getValue(); - uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1); - APInt API(Op1Int.getBitWidth(), 0); - API.setBit(BitToSet); - Op1 = ConstantInt::get(V->getContext(), API); - } - - Value *Mul0 = nullptr; - if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) { - if (Constant *Op1C = dyn_cast<Constant>(Op1)) - if (Constant *MulC = dyn_cast<Constant>(Mul0)) { - if (Op1C->getType()->getPrimitiveSizeInBits().getFixedSize() < - MulC->getType()->getPrimitiveSizeInBits().getFixedSize()) - Op1C = ConstantExpr::getZExt(Op1C, MulC->getType()); - if (Op1C->getType()->getPrimitiveSizeInBits().getFixedSize() > - MulC->getType()->getPrimitiveSizeInBits().getFixedSize()) - MulC = ConstantExpr::getZExt(MulC, Op1C->getType()); - - // V == Base * (Mul0 * Op1), so return (Mul0 * Op1) - Multiple = ConstantExpr::getMul(MulC, Op1C); - return true; - } - - if (ConstantInt *Mul0CI = dyn_cast<ConstantInt>(Mul0)) - if (Mul0CI->getValue() == 1) { - // V == Base * Op1, so return Op1 - Multiple = Op1; - return true; - } - } - - Value *Mul1 = nullptr; - if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) { - if (Constant *Op0C = dyn_cast<Constant>(Op0)) - if (Constant *MulC = dyn_cast<Constant>(Mul1)) { - if (Op0C->getType()->getPrimitiveSizeInBits().getFixedSize() < - MulC->getType()->getPrimitiveSizeInBits().getFixedSize()) - Op0C = ConstantExpr::getZExt(Op0C, MulC->getType()); - if (Op0C->getType()->getPrimitiveSizeInBits().getFixedSize() > - MulC->getType()->getPrimitiveSizeInBits().getFixedSize()) - MulC = ConstantExpr::getZExt(MulC, Op0C->getType()); - - // V == Base * (Mul1 * Op0), so return (Mul1 * Op0) - Multiple = ConstantExpr::getMul(MulC, Op0C); - return true; - } - - if (ConstantInt *Mul1CI = dyn_cast<ConstantInt>(Mul1)) - if (Mul1CI->getValue() == 1) { - // V == Base * Op0, so return Op0 - Multiple = Op0; - return true; - } - } - } - } - - // We could not determine if V is a multiple of Base. 
- return false; -} - Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB, const TargetLibraryInfo *TLI) { const Function *F = CB.getCalledFunction(); @@ -6756,17 +6637,27 @@ Optional<bool> llvm::isImpliedByDomCondition(CmpInst::Predicate Pred, } static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower, - APInt &Upper, const InstrInfoQuery &IIQ) { + APInt &Upper, const InstrInfoQuery &IIQ, + bool PreferSignedRange) { unsigned Width = Lower.getBitWidth(); const APInt *C; switch (BO.getOpcode()) { case Instruction::Add: if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) { - // FIXME: If we have both nuw and nsw, we should reduce the range further. - if (IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(&BO))) { + bool HasNSW = IIQ.hasNoSignedWrap(&BO); + bool HasNUW = IIQ.hasNoUnsignedWrap(&BO); + + // If the caller expects a signed compare, then try to use a signed range. + // Otherwise if both no-wraps are set, use the unsigned range because it + // is never larger than the signed range. Example: + // "add nuw nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125]. + if (PreferSignedRange && HasNSW && HasNUW) + HasNUW = false; + + if (HasNUW) { // 'add nuw x, C' produces [C, UINT_MAX]. Lower = *C; - } else if (IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(&BO))) { + } else if (HasNSW) { if (C->isNegative()) { // 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C]. Lower = APInt::getSignedMinValue(Width); @@ -7083,8 +6974,8 @@ static void setLimitForFPToI(const Instruction *I, APInt &Lower, APInt &Upper) { } } -ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo, - AssumptionCache *AC, +ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned, + bool UseInstrInfo, AssumptionCache *AC, const Instruction *CtxI, const DominatorTree *DT, unsigned Depth) { @@ -7102,7 +6993,7 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo, APInt Lower = APInt(BitWidth, 0); APInt Upper = APInt(BitWidth, 0); if (auto *BO = dyn_cast<BinaryOperator>(V)) - setLimitsForBinOp(*BO, Lower, Upper, IIQ); + setLimitsForBinOp(*BO, Lower, Upper, IIQ, ForSigned); else if (auto *II = dyn_cast<IntrinsicInst>(V)) setLimitsForIntrinsic(*II, Lower, Upper); else if (auto *SI = dyn_cast<SelectInst>(V)) @@ -7134,8 +7025,10 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo, // Currently we just use information from comparisons. if (!Cmp || Cmp->getOperand(0) != V) continue; - ConstantRange RHS = computeConstantRange(Cmp->getOperand(1), UseInstrInfo, - AC, I, DT, Depth + 1); + // TODO: Set "ForSigned" parameter via Cmp->isSigned()? 
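The setLimitsForBinOp comment above supplies the motivating numbers for the new PreferSignedRange knob: for "add nuw nsw i8 X, -2" the nuw view yields the unsigned range [254, 255] while the nsw view yields the signed range [-128, 125], and which is tighter depends on whether the eventual comparison is signed. A quick numeric check of those endpoints, mirroring the example from the comment:

    #include <cassert>
    #include <cstdint>

    int main() {
      // "add nuw nsw i8 X, -2": every X honoring both flags lands in the
      // unsigned range [254, 255] and the signed range [-128, 125].
      for (unsigned X = 0; X <= 255; ++X) {
        unsigned URes = X + 254;                 // -2 as unsigned 8-bit
        int SRes = static_cast<int8_t>(X) - 2;   // signed view
        bool NUW = URes <= 255;                  // no unsigned wrap
        bool NSW = SRes >= -128;                 // no signed wrap
        if (NUW && NSW) {
          assert(URes >= 254 && URes <= 255);
          assert(SRes >= -128 && SRes <= 125);
        }
      }
    }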
+ ConstantRange RHS = + computeConstantRange(Cmp->getOperand(1), /* ForSigned */ false, + UseInstrInfo, AC, I, DT, Depth + 1); CR = CR.intersectWith( ConstantRange::makeAllowedICmpRegion(Cmp->getPredicate(), RHS)); } diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 35c615522fe2..432ec151cf8a 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -133,14 +133,17 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { for (const auto &RAG : ForwardRefAttrGroups) { Value *V = RAG.first; const std::vector<unsigned> &Attrs = RAG.second; - AttrBuilder B; + AttrBuilder B(Context); - for (const auto &Attr : Attrs) - B.merge(NumberedAttrBuilders[Attr]); + for (const auto &Attr : Attrs) { + auto R = NumberedAttrBuilders.find(Attr); + if (R != NumberedAttrBuilders.end()) + B.merge(R->second); + } if (Function *Fn = dyn_cast<Function>(V)) { AttributeList AS = Fn->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttrs()); + AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); @@ -156,27 +159,27 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { Fn->setAttributes(AS); } else if (CallInst *CI = dyn_cast<CallInst>(V)) { AttributeList AS = CI->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttrs()); + AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); AS = AS.addFnAttributes(Context, FnAttrs); CI->setAttributes(AS); } else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) { AttributeList AS = II->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttrs()); + AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); AS = AS.addFnAttributes(Context, FnAttrs); II->setAttributes(AS); } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(V)) { AttributeList AS = CBI->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttrs()); + AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); AS = AS.addFnAttributes(Context, FnAttrs); CBI->setAttributes(AS); } else if (auto *GV = dyn_cast<GlobalVariable>(V)) { - AttrBuilder Attrs(GV->getAttributes()); + AttrBuilder Attrs(M->getContext(), GV->getAttributes()); Attrs.merge(B); GV->setAttributes(AttributeSet::get(Context,Attrs)); } else { @@ -982,17 +985,18 @@ bool LLParser::parseAliasOrIFunc(const std::string &Name, LocTy NameLoc, return error(AliaseeLoc, "An alias or ifunc must have pointer type"); unsigned AddrSpace = PTy->getAddressSpace(); - if (IsAlias && !PTy->isOpaqueOrPointeeTypeMatches(Ty)) { - return error( - ExplicitTypeLoc, - typeComparisonErrorMessage( - "explicit pointee type doesn't match operand's pointee type", Ty, - PTy->getElementType())); - } - - if (!IsAlias && !PTy->getElementType()->isFunctionTy()) { - return error(ExplicitTypeLoc, - "explicit pointee type should be a function type"); + if (IsAlias) { + if (!PTy->isOpaqueOrPointeeTypeMatches(Ty)) + return error( + ExplicitTypeLoc, + typeComparisonErrorMessage( + "explicit pointee type doesn't match operand's pointee type", Ty, + PTy->getNonOpaquePointerElementType())); + } else { + if (!PTy->isOpaque() && + !PTy->getNonOpaquePointerElementType()->isFunctionTy()) + return error(ExplicitTypeLoc, + "explicit pointee type should be a function type"); } GlobalValue *GVal = nullptr; @@ -1206,7 +1210,7 @@ bool LLParser::parseGlobal(const std::string &Name, LocTy NameLoc, } } - AttrBuilder Attrs; + AttrBuilder 
Attrs(M->getContext()); LocTy BuiltinLoc; std::vector<unsigned> FwdRefAttrGrps; if (parseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc)) @@ -1235,13 +1239,18 @@ bool LLParser::parseUnnamedAttrGrp() { Lex.Lex(); if (parseToken(lltok::equal, "expected '=' here") || - parseToken(lltok::lbrace, "expected '{' here") || - parseFnAttributeValuePairs(NumberedAttrBuilders[VarID], unused, true, - BuiltinLoc) || + parseToken(lltok::lbrace, "expected '{' here")) + return true; + + auto R = NumberedAttrBuilders.find(VarID); + if (R == NumberedAttrBuilders.end()) + R = NumberedAttrBuilders.emplace(VarID, AttrBuilder(M->getContext())).first; + + if (parseFnAttributeValuePairs(R->second, unused, true, BuiltinLoc) || parseToken(lltok::rbrace, "expected end of attribute group")) return true; - if (!NumberedAttrBuilders[VarID].hasAttributes()) + if (!R->second.hasAttributes()) return error(AttrGrpLoc, "attribute group has no attributes"); return false; @@ -1402,14 +1411,14 @@ static inline GlobalValue *createGlobalFwdRef(Module *M, PointerType *PTy) { nullptr, GlobalVariable::NotThreadLocal, PTy->getAddressSpace()); - if (auto *FT = dyn_cast<FunctionType>(PTy->getPointerElementType())) + Type *ElemTy = PTy->getNonOpaquePointerElementType(); + if (auto *FT = dyn_cast<FunctionType>(ElemTy)) return Function::Create(FT, GlobalValue::ExternalWeakLinkage, PTy->getAddressSpace(), "", M); else - return new GlobalVariable(*M, PTy->getPointerElementType(), false, - GlobalValue::ExternalWeakLinkage, nullptr, "", - nullptr, GlobalVariable::NotThreadLocal, - PTy->getAddressSpace()); + return new GlobalVariable( + *M, ElemTy, false, GlobalValue::ExternalWeakLinkage, nullptr, "", + nullptr, GlobalVariable::NotThreadLocal, PTy->getAddressSpace()); } Value *LLParser::checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty, @@ -2372,11 +2381,12 @@ bool LLParser::parseParameterList(SmallVectorImpl<ParamInfo> &ArgList, // parse the argument. 
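A recurring mechanical change in this LLParser hunk (and in the bitcode reader further down) is that AttrBuilder is no longer default-constructible and must be handed an LLVMContext up front. Assuming the post-change constructor these call sites imply, usage looks roughly like this sketch; argument parsing resumes right after it:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      AttrBuilder B(Ctx);              // context is now mandatory
      B.addAttribute(Attribute::NoUnwind);

      AttrBuilder Other(Ctx);
      Other.addAttribute(Attribute::ReadOnly);
      B.merge(Other);                  // merging works as before

      return B.hasAttributes() ? 0 : 1;
    }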
LocTy ArgLoc; Type *ArgTy = nullptr; - AttrBuilder ArgAttrs; Value *V; if (parseType(ArgTy, ArgLoc)) return true; + AttrBuilder ArgAttrs(M->getContext()); + if (ArgTy->isMetadataTy()) { if (parseMetadataAsValue(V, PFS)) return true; @@ -2493,7 +2503,7 @@ bool LLParser::parseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, } else { LocTy TypeLoc = Lex.getLoc(); Type *ArgTy = nullptr; - AttrBuilder Attrs; + AttrBuilder Attrs(M->getContext()); std::string Name; if (parseType(ArgTy) || parseOptionalParamAttrs(Attrs)) @@ -3579,7 +3589,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ExplicitTypeLoc, typeComparisonErrorMessage( "explicit pointee type doesn't match operand's pointee type", - Ty, BasePointerType->getElementType())); + Ty, BasePointerType->getNonOpaquePointerElementType())); } unsigned GEPWidth = @@ -4541,16 +4551,17 @@ bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) { OPTIONAL(name, MDStringField, ); \ OPTIONAL(stringLength, MDField, ); \ OPTIONAL(stringLengthExpression, MDField, ); \ + OPTIONAL(stringLocationExpression, MDField, ); \ OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(encoding, DwarfAttEncodingField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIStringType, - (Context, tag.Val, name.Val, stringLength.Val, - stringLengthExpression.Val, size.Val, align.Val, - encoding.Val)); + Result = GET_OR_DISTINCT( + DIStringType, + (Context, tag.Val, name.Val, stringLength.Val, stringLengthExpression.Val, + stringLocationExpression.Val, size.Val, align.Val, encoding.Val)); return false; } @@ -5462,7 +5473,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { unsigned Visibility; unsigned DLLStorageClass; bool DSOLocal; - AttrBuilder RetAttrs; + AttrBuilder RetAttrs(M->getContext()); unsigned CC; bool HasLinkage; Type *RetType = nullptr; @@ -5525,7 +5536,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { SmallVector<ArgInfo, 8> ArgList; bool IsVarArg; - AttrBuilder FuncAttrs; + AttrBuilder FuncAttrs(M->getContext()); std::vector<unsigned> FwdRefAttrGrps; LocTy BuiltinLoc; std::string Section; @@ -5593,7 +5604,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { if (FRVI != ForwardRefVals.end()) { FwdFn = FRVI->second.first; if (!FwdFn->getType()->isOpaque()) { - if (!FwdFn->getType()->getPointerElementType()->isFunctionTy()) + if (!FwdFn->getType()->getNonOpaquePointerElementType()->isFunctionTy()) return error(FRVI->second.second, "invalid forward reference to " "function as global value!"); if (FwdFn->getType() != PFT) @@ -6248,7 +6259,7 @@ bool LLParser::parseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) { /// OptionalAttrs 'to' TypeAndValue 'unwind' TypeAndValue bool LLParser::parseInvoke(Instruction *&Inst, PerFunctionState &PFS) { LocTy CallLoc = Lex.getLoc(); - AttrBuilder RetAttrs, FnAttrs; + AttrBuilder RetAttrs(M->getContext()), FnAttrs(M->getContext()); std::vector<unsigned> FwdRefAttrGrps; LocTy NoBuiltinLoc; unsigned CC; @@ -6558,7 +6569,7 @@ bool LLParser::parseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, /// '[' LabelList ']' bool LLParser::parseCallBr(Instruction *&Inst, PerFunctionState &PFS) { LocTy CallLoc = Lex.getLoc(); - AttrBuilder RetAttrs, FnAttrs; + AttrBuilder RetAttrs(M->getContext()), FnAttrs(M->getContext()); std::vector<unsigned> FwdRefAttrGrps; LocTy NoBuiltinLoc; unsigned CC; @@ -6975,7 +6986,7 @@ bool 
LLParser::parseFreeze(Instruction *&Inst, PerFunctionState &PFS) { /// OptionalAttrs Type Value ParameterList OptionalAttrs bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS, CallInst::TailCallKind TCK) { - AttrBuilder RetAttrs, FnAttrs; + AttrBuilder RetAttrs(M->getContext()), FnAttrs(M->getContext()); std::vector<unsigned> FwdRefAttrGrps; LocTy BuiltinLoc; unsigned CallAddrSpace; @@ -7196,7 +7207,7 @@ int LLParser::parseLoad(Instruction *&Inst, PerFunctionState &PFS) { ExplicitTypeLoc, typeComparisonErrorMessage( "explicit pointee type doesn't match operand's pointee type", Ty, - cast<PointerType>(Val->getType())->getElementType())); + Val->getType()->getNonOpaquePointerElementType())); } SmallPtrSet<Type *, 4> Visited; if (!Alignment && !Ty->isSized(&Visited)) @@ -7456,7 +7467,7 @@ int LLParser::parseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) { ExplicitTypeLoc, typeComparisonErrorMessage( "explicit pointee type doesn't match operand's pointee type", Ty, - BasePointerType->getElementType())); + BasePointerType->getNonOpaquePointerElementType())); } SmallVector<Value*, 16> Indices; diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index 284e469a1d2f..99d2c8221281 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -12,8 +12,14 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" + +#include <map> +#include <utility> namespace llvm { namespace AMDGPU { diff --git a/llvm/lib/BinaryFormat/ELF.cpp b/llvm/lib/BinaryFormat/ELF.cpp index 2ede63f464d3..e2e601b6d90f 100644 --- a/llvm/lib/BinaryFormat/ELF.cpp +++ b/llvm/lib/BinaryFormat/ELF.cpp @@ -7,9 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/ELF.h" -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/Error.h" using namespace llvm; using namespace ELF; diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp index 8c7f7b7043a0..044e4840cb3b 100644 --- a/llvm/lib/BinaryFormat/Magic.cpp +++ b/llvm/lib/BinaryFormat/Magic.cpp @@ -10,10 +10,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/COFF.h" -#include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #if !defined(_MSC_VER) && !defined(__MINGW32__) @@ -88,7 +86,10 @@ file_magic llvm::identify_magic(StringRef Magic) { if (startswith(Magic, "!<arch>\n") || startswith(Magic, "!<thin>\n")) return file_magic::archive; break; - + case '<': + if (startswith(Magic, "<bigaf>\n")) + return file_magic::archive; + break; case '\177': if (startswith(Magic, "\177ELF") && Magic.size() >= 18) { bool Data2MSB = Magic[5] == 2; diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index a36b256c29b6..ffef35299981 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -781,7 +781,7 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned 
IndentLevel, uint64_t MetadataIndexOffset = 0; // Read all the records for this block. - while (1) { + while (true) { if (Stream.AtEndOfStream()) return reportError("Premature end of bitstream"); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index f5a878f8788a..720ab560f988 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1349,7 +1349,7 @@ Error BitcodeReader::parseAttributeBlock() { return error("Invalid record"); for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - AttrBuilder B; + AttrBuilder B(Context); decodeLLVMAttributesForBitcode(B, Record[i+1]); Attrs.push_back(AttributeList::get(Context, Record[i], B)); } @@ -1591,7 +1591,7 @@ Error BitcodeReader::parseAttributeGroupBlock() { uint64_t GrpID = Record[0]; uint64_t Idx = Record[1]; // Index of the object this attribute refers to. - AttrBuilder B; + AttrBuilder B(Context); for (unsigned i = 2, e = Record.size(); i != e; ++i) { if (Record[i] == 0) { // Enum attribute Attribute::AttrKind Kind; @@ -2702,7 +2702,7 @@ Error BitcodeReader::parseConstants() { PointerType *OrigPtrTy = cast<PointerType>(Elt0FullTy->getScalarType()); if (!PointeeType) - PointeeType = OrigPtrTy->getElementType(); + PointeeType = OrigPtrTy->getPointerElementType(); else if (!OrigPtrTy->isOpaqueOrPointeeTypeMatches(PointeeType)) return error("Explicit gep operator type does not match pointee type " "of pointer operand"); @@ -2824,9 +2824,9 @@ Error BitcodeReader::parseConstants() { for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[3+AsmStrSize+i]; UpgradeInlineAsmString(&AsmStr); - V = InlineAsm::get( - cast<FunctionType>(cast<PointerType>(CurTy)->getElementType()), - AsmStr, ConstrStr, HasSideEffects, IsAlignStack); + // FIXME: support upgrading in opaque pointers mode. + V = InlineAsm::get(cast<FunctionType>(CurTy->getPointerElementType()), + AsmStr, ConstrStr, HasSideEffects, IsAlignStack); break; } // This version adds support for the asm dialect keywords (e.g., @@ -2850,37 +2850,74 @@ Error BitcodeReader::parseConstants() { for (unsigned i = 0; i != ConstStrSize; ++i) ConstrStr += (char)Record[3+AsmStrSize+i]; UpgradeInlineAsmString(&AsmStr); - V = InlineAsm::get( - cast<FunctionType>(cast<PointerType>(CurTy)->getElementType()), - AsmStr, ConstrStr, HasSideEffects, IsAlignStack, - InlineAsm::AsmDialect(AsmDialect)); + // FIXME: support upgrading in opaque pointers mode. + V = InlineAsm::get(cast<FunctionType>(CurTy->getPointerElementType()), + AsmStr, ConstrStr, HasSideEffects, IsAlignStack, + InlineAsm::AsmDialect(AsmDialect)); break; } // This version adds support for the unwind keyword. 
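The inline-asm constant records rewritten just below switch from fixed indices into Record to a running OpNum cursor, which is what lets a new leading field (the explicit function type) slide in without disturbing the rest of the layout. A stripped-down cursor-style decoder for a [flags, strsize, str..., constrsize, constr...] record; names and layout are assumed for illustration only:

    #include <cstdint>
    #include <optional>
    #include <string>
    #include <vector>

    struct AsmRecord {
      bool HasSideEffects;
      std::string AsmStr, ConstrStr;
    };

    // Decode with a moving cursor and explicit bounds checks, returning
    // nullopt where the real reader would report "Invalid record".
    static std::optional<AsmRecord> decode(const std::vector<uint64_t> &Record) {
      size_t OpNum = 0;
      if (Record.size() < 2)
        return std::nullopt;
      AsmRecord R;
      R.HasSideEffects = Record[OpNum++] & 1;
      uint64_t AsmStrSize = Record[OpNum++];
      if (OpNum + AsmStrSize >= Record.size())
        return std::nullopt;
      for (uint64_t i = 0; i != AsmStrSize; ++i)
        R.AsmStr += static_cast<char>(Record[OpNum + i]);
      uint64_t ConstrSize = Record[OpNum + AsmStrSize];
      if (OpNum + 1 + AsmStrSize + ConstrSize > Record.size())
        return std::nullopt;
      for (uint64_t i = 0; i != ConstrSize; ++i)
        R.ConstrStr += static_cast<char>(Record[OpNum + 1 + AsmStrSize + i]);
      return R;
    }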
- case bitc::CST_CODE_INLINEASM: { + case bitc::CST_CODE_INLINEASM_OLD3: { if (Record.size() < 2) return error("Invalid record"); + unsigned OpNum = 0; std::string AsmStr, ConstrStr; - bool HasSideEffects = Record[0] & 1; - bool IsAlignStack = (Record[0] >> 1) & 1; - unsigned AsmDialect = (Record[0] >> 2) & 1; - bool CanThrow = (Record[0] >> 3) & 1; - unsigned AsmStrSize = Record[1]; - if (2 + AsmStrSize >= Record.size()) + bool HasSideEffects = Record[OpNum] & 1; + bool IsAlignStack = (Record[OpNum] >> 1) & 1; + unsigned AsmDialect = (Record[OpNum] >> 2) & 1; + bool CanThrow = (Record[OpNum] >> 3) & 1; + ++OpNum; + unsigned AsmStrSize = Record[OpNum]; + ++OpNum; + if (OpNum + AsmStrSize >= Record.size()) return error("Invalid record"); - unsigned ConstStrSize = Record[2 + AsmStrSize]; - if (3 + AsmStrSize + ConstStrSize > Record.size()) + unsigned ConstStrSize = Record[OpNum + AsmStrSize]; + if (OpNum + 1 + AsmStrSize + ConstStrSize > Record.size()) return error("Invalid record"); for (unsigned i = 0; i != AsmStrSize; ++i) - AsmStr += (char)Record[2 + i]; + AsmStr += (char)Record[OpNum + i]; + ++OpNum; for (unsigned i = 0; i != ConstStrSize; ++i) - ConstrStr += (char)Record[3 + AsmStrSize + i]; + ConstrStr += (char)Record[OpNum + AsmStrSize + i]; UpgradeInlineAsmString(&AsmStr); - V = InlineAsm::get( - cast<FunctionType>(cast<PointerType>(CurTy)->getElementType()), - AsmStr, ConstrStr, HasSideEffects, IsAlignStack, - InlineAsm::AsmDialect(AsmDialect), CanThrow); + // FIXME: support upgrading in opaque pointers mode. + V = InlineAsm::get(cast<FunctionType>(CurTy->getPointerElementType()), + AsmStr, ConstrStr, HasSideEffects, IsAlignStack, + InlineAsm::AsmDialect(AsmDialect), CanThrow); + break; + } + // This version adds explicit function type. + case bitc::CST_CODE_INLINEASM: { + if (Record.size() < 3) + return error("Invalid record"); + unsigned OpNum = 0; + auto *FnTy = dyn_cast_or_null<FunctionType>(getTypeByID(Record[OpNum])); + ++OpNum; + if (!FnTy) + return error("Invalid record"); + std::string AsmStr, ConstrStr; + bool HasSideEffects = Record[OpNum] & 1; + bool IsAlignStack = (Record[OpNum] >> 1) & 1; + unsigned AsmDialect = (Record[OpNum] >> 2) & 1; + bool CanThrow = (Record[OpNum] >> 3) & 1; + ++OpNum; + unsigned AsmStrSize = Record[OpNum]; + ++OpNum; + if (OpNum + AsmStrSize >= Record.size()) + return error("Invalid record"); + unsigned ConstStrSize = Record[OpNum + AsmStrSize]; + if (OpNum + 1 + AsmStrSize + ConstStrSize > Record.size()) + return error("Invalid record"); + + for (unsigned i = 0; i != AsmStrSize; ++i) + AsmStr += (char)Record[OpNum + i]; + ++OpNum; + for (unsigned i = 0; i != ConstStrSize; ++i) + ConstrStr += (char)Record[OpNum + AsmStrSize + i]; + UpgradeInlineAsmString(&AsmStr); + V = InlineAsm::get(FnTy, AsmStr, ConstrStr, HasSideEffects, IsAlignStack, + InlineAsm::AsmDialect(AsmDialect), CanThrow); break; } case bitc::CST_CODE_BLOCKADDRESS:{ @@ -3242,7 +3279,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { if (!Ty->isPointerTy()) return error("Invalid type for value"); AddressSpace = cast<PointerType>(Ty)->getAddressSpace(); - Ty = cast<PointerType>(Ty)->getElementType(); + Ty = Ty->getPointerElementType(); } uint64_t RawLinkage = Record[3]; @@ -3335,7 +3372,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) { if (!FTy) return error("Invalid record"); if (auto *PTy = dyn_cast<PointerType>(FTy)) - FTy = PTy->getElementType(); + FTy = PTy->getPointerElementType(); if (!isa<FunctionType>(FTy)) return 
error("Invalid type for value"); @@ -3376,7 +3413,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) { Func->removeParamAttr(i, Kind); Type *PTy = cast<FunctionType>(FTy)->getParamType(i); - Type *PtrEltTy = cast<PointerType>(PTy)->getElementType(); + Type *PtrEltTy = PTy->getPointerElementType(); Attribute NewAttr; switch (Kind) { case Attribute::ByVal: @@ -3499,7 +3536,7 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord( auto *PTy = dyn_cast<PointerType>(Ty); if (!PTy) return error("Invalid type for value"); - Ty = PTy->getElementType(); + Ty = PTy->getPointerElementType(); AddrSpace = PTy->getAddressSpace(); } else { AddrSpace = Record[OpNum++]; @@ -3795,6 +3832,11 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, if (Error Err = parseComdatRecord(Record)) return Err; break; + // FIXME: BitcodeReader should handle {GLOBALVAR, FUNCTION, ALIAS, IFUNC} + // written by ThinLinkBitcodeWriter. See + // `ThinLinkBitcodeWriter::writeSimplifiedModuleInfo` for the format of each + // record + // (https://github.com/llvm/llvm-project/blob/b6a93967d9c11e79802b5e75cec1584d6c8aa472/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp#L4714) case bitc::MODULE_CODE_GLOBALVAR: if (Error Err = parseGlobalVarRecord(Record)) return Err; @@ -3857,12 +3899,13 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB, for (unsigned i = 0; i != CB->arg_size(); ++i) { for (Attribute::AttrKind Kind : {Attribute::ByVal, Attribute::StructRet, Attribute::InAlloca}) { - if (!CB->paramHasAttr(i, Kind)) + if (!CB->paramHasAttr(i, Kind) || + CB->getParamAttr(i, Kind).getValueAsType()) continue; CB->removeParamAttr(i, Kind); - Type *PtrEltTy = cast<PointerType>(ArgsTys[i])->getElementType(); + Type *PtrEltTy = ArgsTys[i]->getPointerElementType(); Attribute NewAttr; switch (Kind) { case Attribute::ByVal: @@ -3882,11 +3925,28 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB, } } + if (CB->isInlineAsm()) { + const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand()); + unsigned ArgNo = 0; + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + if (!CI.hasArg()) + continue; + + if (CI.isIndirect && !CB->getAttributes().getParamElementType(ArgNo)) { + Type *ElemTy = ArgsTys[ArgNo]->getPointerElementType(); + CB->addParamAttr( + ArgNo, Attribute::get(Context, Attribute::ElementType, ElemTy)); + } + + ArgNo++; + } + } + switch (CB->getIntrinsicID()) { case Intrinsic::preserve_array_access_index: case Intrinsic::preserve_struct_access_index: if (!CB->getAttributes().getParamElementType(0)) { - Type *ElTy = cast<PointerType>(ArgsTys[0])->getElementType(); + Type *ElTy = ArgsTys[0]->getPointerElementType(); Attribute NewAttr = Attribute::get(Context, Attribute::ElementType, ElTy); CB->addParamAttr(0, NewAttr); } @@ -4176,8 +4236,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid record"); if (!Ty) { - Ty = cast<PointerType>(BasePtr->getType()->getScalarType()) - ->getElementType(); + Ty = BasePtr->getType()->getScalarType()->getPointerElementType(); } else if (!cast<PointerType>(BasePtr->getType()->getScalarType()) ->isOpaqueOrPointeeTypeMatches(Ty)) { return error( @@ -4693,8 +4752,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (!CalleeTy) return error("Callee is not a pointer"); if (!FTy) { - FTy = dyn_cast<FunctionType>( - cast<PointerType>(Callee->getType())->getElementType()); + FTy = + dyn_cast<FunctionType>(Callee->getType()->getPointerElementType()); if (!FTy) return error("Callee is not of pointer to function type"); } 
else if (!CalleeTy->isOpaqueOrPointeeTypeMatches(FTy)) @@ -4774,26 +4833,29 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (!OpTy) return error("Callee is not a pointer type"); if (!FTy) { - FTy = dyn_cast<FunctionType>( - cast<PointerType>(Callee->getType())->getElementType()); + FTy = + dyn_cast<FunctionType>(Callee->getType()->getPointerElementType()); if (!FTy) return error("Callee is not of pointer to function type"); - } else if (cast<PointerType>(Callee->getType())->getElementType() != FTy) + } else if (!OpTy->isOpaqueOrPointeeTypeMatches(FTy)) return error("Explicit call type does not match pointee type of " "callee operand"); if (Record.size() < FTy->getNumParams() + OpNum) return error("Insufficient operands to call"); SmallVector<Value*, 16> Args; + SmallVector<Type *, 16> ArgsTys; // Read the fixed params. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { + Value *Arg; if (FTy->getParamType(i)->isLabelTy()) - Args.push_back(getBasicBlock(Record[OpNum])); + Arg = getBasicBlock(Record[OpNum]); else - Args.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); - if (!Args.back()) + Arg = getValue(Record, OpNum, NextValueNo, FTy->getParamType(i)); + if (!Arg) return error("Invalid record"); + Args.push_back(Arg); + ArgsTys.push_back(Arg->getType()); } // Read type/value pairs for varargs params. @@ -4806,6 +4868,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (getValueTypePair(Record, OpNum, NextValueNo, Op)) return error("Invalid record"); Args.push_back(Op); + ArgsTys.push_back(Op->getType()); } } @@ -4816,6 +4879,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { cast<CallBrInst>(I)->setCallingConv( static_cast<CallingConv::ID>((0x7ff & CCInfo) >> bitc::CALL_CCONV)); cast<CallBrInst>(I)->setAttributes(PAL); + propagateAttributeTypes(cast<CallBase>(I), ArgsTys); break; } case bitc::FUNC_CODE_INST_UNREACHABLE: // UNREACHABLE @@ -4932,7 +4996,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { auto *PTy = dyn_cast_or_null<PointerType>(Ty); if (!PTy) return error("Old-style alloca with a non-pointer type"); - Ty = PTy->getElementType(); + Ty = PTy->getPointerElementType(); } Type *OpTy = getTypeByID(Record[1]); Value *Size = getFnValueByID(Record[2], OpTy); @@ -4977,7 +5041,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (OpNum + 3 == Record.size()) { Ty = getTypeByID(Record[OpNum++]); } else { - Ty = cast<PointerType>(Op->getType())->getElementType(); + Ty = Op->getType()->getPointerElementType(); } if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType())) @@ -5010,7 +5074,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (OpNum + 5 == Record.size()) { Ty = getTypeByID(Record[OpNum++]); } else { - Ty = cast<PointerType>(Op->getType())->getElementType(); + Ty = Op->getType()->getPointerElementType(); } if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType())) @@ -5042,8 +5106,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { (BitCode == bitc::FUNC_CODE_INST_STORE ? getValueTypePair(Record, OpNum, NextValueNo, Val) : popValue(Record, OpNum, NextValueNo, - cast<PointerType>(Ptr->getType())->getElementType(), - Val)) || + Ptr->getType()->getPointerElementType(), Val)) || OpNum + 2 != Record.size()) return error("Invalid record"); @@ -5071,8 +5134,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { (BitCode == bitc::FUNC_CODE_INST_STOREATOMIC ? 
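A rewrite that recurs through this whole import: cast<PointerType>(T)->getElementType() becomes T->getPointerElementType(), and exact pointee equality becomes isOpaqueOrPointeeTypeMatches(), so an explicit function type in the record is still cross-checked under typed pointers and trusted outright under opaque ones. The callee-validation shape shared by the call, invoke, and callbr paths above, as a hedged sketch with simplified error plumbing:

static Expected<FunctionType *> checkCalleeType(Value *Callee,
                                                FunctionType *FTy) {
  auto *PTy = dyn_cast<PointerType>(Callee->getType());
  if (!PTy)
    return createStringError(inconvertibleErrorCode(),
                             "Callee is not a pointer type");
  if (!FTy) {
    // No explicit type in the record: derive it from the pointee.
    FTy = dyn_cast<FunctionType>(Callee->getType()->getPointerElementType());
    if (!FTy)
      return createStringError(inconvertibleErrorCode(),
                               "Callee is not of pointer to function type");
  } else if (!PTy->isOpaqueOrPointeeTypeMatches(FTy)) {
    return createStringError(inconvertibleErrorCode(),
                             "Explicit call type does not match pointee type");
  }
  return FTy;
}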
getValueTypePair(Record, OpNum, NextValueNo, Val) : popValue(Record, OpNum, NextValueNo, - cast<PointerType>(Ptr->getType())->getElementType(), - Val)) || + Ptr->getType()->getPointerElementType(), Val)) || OpNum + 4 != Record.size()) return error("Invalid record"); @@ -5323,8 +5385,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (!OpTy) return error("Callee is not a pointer type"); if (!FTy) { - FTy = dyn_cast<FunctionType>( - cast<PointerType>(Callee->getType())->getElementType()); + FTy = + dyn_cast<FunctionType>(Callee->getType()->getPointerElementType()); if (!FTy) return error("Callee is not of pointer to function type"); } else if (!OpTy->isOpaqueOrPointeeTypeMatches(FTy)) diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 60530d7f7a00..0f4111514057 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1105,7 +1105,7 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata( void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders( PlaceholderQueue &Placeholders) { DenseSet<unsigned> Temporaries; - while (1) { + while (true) { // Populate Temporaries with the placeholders that haven't been loaded yet. Placeholders.getTemporaries(MetadataList, Temporaries); @@ -1423,15 +1423,21 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_STRING_TYPE: { - if (Record.size() != 8) + if (Record.size() > 9 || Record.size() < 8) return error("Invalid record"); IsDistinct = Record[0]; + bool SizeIs8 = Record.size() == 8; + // StringLocationExp (i.e. Record[5]) is added at a later time + // than the other fields. The code here enables backward compatibility. + Metadata *StringLocationExp = SizeIs8 ? nullptr : getMDOrNull(Record[5]); + unsigned Offset = SizeIs8 ? 5 : 6; MetadataList.assignValue( GET_OR_DISTINCT(DIStringType, (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), getMDOrNull(Record[4]), - Record[5], Record[6], Record[7])), + StringLocationExp, Record[Offset], Record[Offset + 1], + Record[Offset + 2])), NextMetadataNo); NextMetadataNo++; break; @@ -1632,7 +1638,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Record.size() <= 16 ? true : Record[16], Record.size() <= 17 ? false : Record[17], Record.size() <= 18 ? 0 : Record[18], - Record.size() <= 19 ? 0 : Record[19], + Record.size() <= 19 ? false : Record[19], Record.size() <= 20 ? nullptr : getMDString(Record[20]), Record.size() <= 21 ? nullptr : getMDString(Record[21])); @@ -1675,7 +1681,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( SPFlags = DISubprogram::toSPFlags( /*IsLocalToUnit=*/Record[7], /*IsDefinition=*/Record[8], /*IsOptimized=*/Record[14], /*Virtuality=*/Record[11], - /*DIFlagMainSubprogram=*/HasOldMainSubprogramFlag); + /*IsMainSubprogram=*/HasOldMainSubprogramFlag); // All definitions should be distinct. 
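The METADATA_STRING_TYPE case above now accepts two layouts: the legacy 8-field record, and the 9-field one with the new StringLocationExp at index 5, which pushes size/align/encoding up by one slot. The index arithmetic in standalone form (illustrative names):

#include <cstddef>

struct StringTypeLayout {
  bool HasLocationExp;
  unsigned SizeIdx, AlignIdx, EncodingIdx;
};

StringTypeLayout stringTypeLayoutFor(size_t RecordSize) {
  bool IsNew = RecordSize == 9; // 8 = pre-StringLocationExp records
  unsigned Off = IsNew ? 6 : 5; // first scalar after the MD operands
  return {IsNew, Off, Off + 1, Off + 2};
}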
IsDistinct = (Record[0] & 1) || (SPFlags & DISubprogram::SPFlagDefinition); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index dc06bc10cf95..eb4e09ea3a26 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -948,7 +948,7 @@ void ModuleBitcodeWriter::writeTypeTable() { } else { // POINTER: [pointee type, address space] Code = bitc::TYPE_CODE_POINTER; - TypeVals.push_back(VE.getTypeID(PTy->getElementType())); + TypeVals.push_back(VE.getTypeID(PTy->getNonOpaquePointerElementType())); TypeVals.push_back(AddressSpace); if (AddressSpace == 0) AbbrevToUse = PtrAbbrev; @@ -1657,6 +1657,7 @@ void ModuleBitcodeWriter::writeDIStringType(const DIStringType *N, Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getStringLength())); Record.push_back(VE.getMetadataOrNullID(N->getStringLengthExp())); + Record.push_back(VE.getMetadataOrNullID(N->getStringLocationExp())); Record.push_back(N->getSizeInBits()); Record.push_back(N->getAlignInBits()); Record.push_back(N->getEncoding()); @@ -2458,6 +2459,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, } if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { + Record.push_back(VE.getTypeID(IA->getFunctionType())); Record.push_back( unsigned(IA->hasSideEffects()) | unsigned(IA->isAlignStack()) << 1 | unsigned(IA->getDialect() & 1) << 2 | unsigned(IA->canThrow()) << 3); diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index df4f1a1873d7..01f7e85bd60e 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -386,8 +386,10 @@ ValueEnumerator::ValueEnumerator(const Module &M, } // Enumerate the ifuncs. - for (const GlobalIFunc &GIF : M.ifuncs()) + for (const GlobalIFunc &GIF : M.ifuncs()) { EnumerateValue(&GIF); + EnumerateType(GIF.getValueType()); + } // Remember what is the cutoff between globalvalue's and other constants. unsigned FirstConstant = Values.size(); diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 5c64622c7245..bb71d72256d8 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -120,8 +120,7 @@ bool AggressiveAntiDepState::IsLive(unsigned Reg) { AggressiveAntiDepBreaker::AggressiveAntiDepBreaker( MachineFunction &MFi, const RegisterClassInfo &RCI, TargetSubtargetInfo::RegClassVector &CriticalPathRCs) - : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), - TII(MF.getSubtarget().getInstrInfo()), + : MF(MFi), MRI(MF.getRegInfo()), TII(MF.getSubtarget().getInstrInfo()), TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI) { /* Collect a bitset of all registers that are only broken if they are on the critical path. */ diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 7e68e5e22879..e8fef505e43d 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -577,9 +577,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I, bool &ADS = AllowDifferingSizes ? 
*AllowDifferingSizes : DummyADS; ADS = true; - AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex); - AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(), - AttributeList::ReturnIndex); + AttrBuilder CallerAttrs(F->getContext(), F->getAttributes().getRetAttrs()); + AttrBuilder CalleeAttrs(F->getContext(), + cast<CallInst>(I)->getAttributes().getRetAttrs()); // Following attributes are completely benign as far as calling convention // goes, they shouldn't affect whether the call is a tail call. diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp index 964cef75d164..03e63321e3c4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp @@ -23,6 +23,8 @@ namespace llvm { AIXException::AIXException(AsmPrinter *A) : DwarfCFIExceptionBase(A) {} +void AIXException::markFunctionEnd() { endFragment(); } + void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA, const MCSymbol *PerSym) { // Generate EH Info Table. diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 533f20535655..4f3f798fe6f8 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -247,6 +247,11 @@ void AsmPrinter::emitInitialRawDwarfLocDirective(const MachineFunction &MF) { if (DD) { assert(OutStreamer->hasRawTextSupport() && "Expected assembly output mode."); + // This is NVPTX specific and it's unclear why. + // PR51079: If we have code without debug information we need to give up. + DISubprogram *MFSP = MF.getFunction().getSubprogram(); + if (!MFSP) + return; (void)DD->emitInitialLocDirective(MF, /*CUID=*/0); } } @@ -2477,7 +2482,8 @@ void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, // two boundary. If a global value is specified, and if that global has // an explicit alignment requested, it will override the alignment request // if required for correctness. -void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const { +void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV, + unsigned MaxBytesToEmit) const { if (GV) Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment); @@ -2490,9 +2496,9 @@ void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const { STI = &getSubtargetInfo(); else STI = TM.getMCSubtargetInfo(); - OutStreamer->emitCodeAlignment(Alignment.value(), STI); + OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit); } else - OutStreamer->emitValueToAlignment(Alignment.value()); + OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit); } //===----------------------------------------------------------------------===// @@ -3286,7 +3292,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { // Emit an alignment directive for this block, if needed. const Align Alignment = MBB.getAlignment(); if (Alignment != Align(1)) - emitAlignment(Alignment); + emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment()); // Switch to a new section if this basic block must begin a section. 
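emitAlignment above gains a MaxBytesToEmit parameter, threaded from MBB.getMaxBytesForAlignment() at block starts down to emitCodeAlignment/emitValueToAlignment; the effect is the classic .p2align third argument, where the alignment is skipped when it would cost more padding than the cap allows (0 meaning uncapped). A runnable sketch of that decision, assuming a power-of-two Align:

#include <cstdint>

bool shouldPad(uint64_t Offset, uint64_t Align, unsigned MaxBytesToEmit) {
  uint64_t Pad = (Align - Offset % Align) % Align; // bytes needed to align
  return MaxBytesToEmit == 0 || Pad <= MaxBytesToEmit;
}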
The // entry block is always placed in the function section and is handled @@ -3648,6 +3654,12 @@ unsigned int AsmPrinter::getDwarfOffsetByteSize() const { OutStreamer->getContext().getDwarfFormat()); } +dwarf::FormParams AsmPrinter::getDwarfFormParams() const { + return {getDwarfVersion(), uint8_t(getPointerSize()), + OutStreamer->getContext().getDwarfFormat(), + MAI->doesDwarfUseRelocationsAcrossSections()}; +} + unsigned int AsmPrinter::getUnitLengthFieldByteSize() const { return dwarf::getUnitLengthFieldByteSize( OutStreamer->getContext().getDwarfFormat()); diff --git a/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h index 5e7db1f2f76c..bd2c60eadd61 100644 --- a/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h +++ b/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h @@ -33,6 +33,7 @@ class ByteStreamer { virtual void emitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0; virtual void emitULEB128(uint64_t DWord, const Twine &Comment = "", unsigned PadTo = 0) = 0; + virtual unsigned emitDIERef(const DIE &D) = 0; }; class APByteStreamer final : public ByteStreamer { @@ -54,15 +55,24 @@ public: AP.OutStreamer->AddComment(Comment); AP.emitULEB128(DWord, nullptr, PadTo); } + unsigned emitDIERef(const DIE &D) override { + uint64_t Offset = D.getOffset(); + static constexpr unsigned ULEB128PadSize = 4; + assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit"); + emitULEB128(Offset, "", ULEB128PadSize); + // Return how many comments to skip in DwarfDebug::emitDebugLocEntry to keep + // comments aligned with debug loc entries. + return ULEB128PadSize; + } }; class HashingByteStreamer final : public ByteStreamer { private: DIEHash &Hash; public: - HashingByteStreamer(DIEHash &H) : Hash(H) {} - void emitInt8(uint8_t Byte, const Twine &Comment) override { - Hash.update(Byte); + HashingByteStreamer(DIEHash &H) : Hash(H) {} + void emitInt8(uint8_t Byte, const Twine &Comment) override { + Hash.update(Byte); } void emitSLEB128(uint64_t DWord, const Twine &Comment) override { Hash.addSLEB128(DWord); @@ -71,6 +81,10 @@ class HashingByteStreamer final : public ByteStreamer { unsigned PadTo) override { Hash.addULEB128(DWord); } + unsigned emitDIERef(const DIE &D) override { + Hash.hashRawTypeReference(D); + return 0; // Only used together with the APByteStreamer. + } }; class BufferByteStreamer final : public ByteStreamer { @@ -115,9 +129,15 @@ public: // with each other. for (size_t i = 1; i < Length; ++i) Comments.push_back(""); - } } + unsigned emitDIERef(const DIE &D) override { + uint64_t Offset = D.getOffset(); + static constexpr unsigned ULEB128PadSize = 4; + assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit"); + emitULEB128(Offset, "", ULEB128PadSize); + return 0; // Only used together with the APByteStreamer. 
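All emitDIERef implementations here share one constraint: the referenced DIE's offset is not final when a debug-loc entry is being sized, so the ULEB is padded to a fixed four-byte slot, and four ULEB bytes at 7 payload bits each is exactly the Offset < 2^28 assertion. A runnable encoder producing the same bytes as llvm::encodeULEB128 with PadTo, for values that fit (PadTo <= 8 assumed):

#include <cassert>
#include <cstdint>
#include <vector>

void emitPaddedULEB128(std::vector<uint8_t> &Out, uint64_t V, unsigned PadTo) {
  assert(V < (1ULL << (PadTo * 7)) && "value does not fit in padded slot");
  for (unsigned I = 0; I != PadTo; ++I) {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (I + 1 != PadTo)
      Byte |= 0x80; // continuation bit keeps the slot width fixed
    Out.push_back(Byte);
  }
}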
+ } }; } diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index d621108408f0..52c74713551c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -68,6 +68,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -600,6 +601,8 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) { return SourceLanguage::D; case dwarf::DW_LANG_Swift: return SourceLanguage::Swift; + case dwarf::DW_LANG_Rust: + return SourceLanguage::Rust; default: // There's no CodeView representation for this language, and CV doesn't // have an "unknown" option for the language field, so we'll use MASM, @@ -843,6 +846,12 @@ void CodeViewDebug::emitCompilerInformation() { if (MMI->getModule()->getProfileSummary(/*IsCS*/ false) != nullptr) { Flags |= static_cast<uint32_t>(CompileSym3Flags::PGO); } + using ArchType = llvm::Triple::ArchType; + ArchType Arch = Triple(MMI->getModule()->getTargetTriple()).getArch(); + if (Asm->TM.Options.Hotpatch || Arch == ArchType::thumb || + Arch == ArchType::aarch64) { + Flags |= static_cast<uint32_t>(CompileSym3Flags::HotPatch); + } OS.AddComment("Flags and language"); OS.emitInt32(Flags); @@ -857,8 +866,10 @@ void CodeViewDebug::emitCompilerInformation() { StringRef CompilerVersion = CU->getProducer(); Version FrontVer = parseVersion(CompilerVersion); OS.AddComment("Frontend version"); - for (int N : FrontVer.Part) + for (int N : FrontVer.Part) { + N = std::min<int>(N, std::numeric_limits<uint16_t>::max()); OS.emitInt16(N); + } // Some Microsoft tools, like Binscope, expect a backend version number of at // least 8.something, so we'll coerce the LLVM version into a form that @@ -885,6 +896,34 @@ static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable, return TypeTable.writeLeafType(SIR); } +static std::string flattenCommandLine(ArrayRef<std::string> Args, + StringRef MainFilename) { + std::string FlatCmdLine; + raw_string_ostream OS(FlatCmdLine); + bool PrintedOneArg = false; + if (!StringRef(Args[0]).contains("-cc1")) { + llvm::sys::printArg(OS, "-cc1", /*Quote=*/true); + PrintedOneArg = true; + } + for (unsigned i = 0; i < Args.size(); i++) { + StringRef Arg = Args[i]; + if (Arg.empty()) + continue; + if (Arg == "-main-file-name" || Arg == "-o") { + i++; // Skip this argument and next one. + continue; + } + if (Arg.startswith("-object-file-name") || Arg == MainFilename) + continue; + if (PrintedOneArg) + OS << " "; + llvm::sys::printArg(OS, Arg, /*Quote=*/true); + PrintedOneArg = true; + } + OS.flush(); + return FlatCmdLine; +} + void CodeViewDebug::emitBuildInfo() { // First, make LF_BUILDINFO. It's a sequence of strings with various bits of // build info. The known prefix is: @@ -905,8 +944,16 @@ void CodeViewDebug::emitBuildInfo() { getStringIdTypeIdx(TypeTable, MainSourceFile->getDirectory()); BuildInfoArgs[BuildInfoRecord::SourceFile] = getStringIdTypeIdx(TypeTable, MainSourceFile->getFilename()); - // FIXME: Path to compiler and command line. PDB is intentionally blank unless - // we implement /Zi type servers. + // FIXME: PDB is intentionally blank unless we implement /Zi type servers. 
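flattenCommandLine above reconstructs a cc1 line that stays identical across every object in a build: it prepends "-cc1" if the first argument lacks it, then drops -main-file-name and -o together with their operands, anything starting with -object-file-name, and the main source file itself, quoting survivors via llvm::sys::printArg. The filtering rule alone, runnable, with quoting omitted:

#include <string>
#include <vector>

std::string flattenArgs(const std::vector<std::string> &Args,
                        const std::string &MainFile) {
  std::string Out;
  for (size_t I = 0; I < Args.size(); ++I) {
    const std::string &A = Args[I];
    if (A.empty()) continue;
    if (A == "-main-file-name" || A == "-o") { ++I; continue; } // and operand
    if (A.rfind("-object-file-name", 0) == 0 || A == MainFile) continue;
    if (!Out.empty()) Out += ' ';
    Out += A;
  }
  return Out;
}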
+ BuildInfoArgs[BuildInfoRecord::TypeServerPDB] = + getStringIdTypeIdx(TypeTable, ""); + if (Asm->TM.Options.MCOptions.Argv0 != nullptr) { + BuildInfoArgs[BuildInfoRecord::BuildTool] = + getStringIdTypeIdx(TypeTable, Asm->TM.Options.MCOptions.Argv0); + BuildInfoArgs[BuildInfoRecord::CommandLine] = getStringIdTypeIdx( + TypeTable, flattenCommandLine(Asm->TM.Options.MCOptions.CommandLineArgs, + MainSourceFile->getFilename())); + } BuildInfoRecord BIR(BuildInfoArgs); TypeIndex BuildInfoIndex = TypeTable.writeLeafType(BIR); diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 2834d9c3ebbf..1a0256f30d41 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -274,7 +274,7 @@ LLVM_DUMP_METHOD void DIE::dump() const { } #endif -unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP, +unsigned DIE::computeOffsetsAndAbbrevs(const dwarf::FormParams &FormParams, DIEAbbrevSet &AbbrevSet, unsigned CUOffset) { // Unique the abbreviation and fill in the abbreviation number so this DIE @@ -289,7 +289,7 @@ unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP, // Add the byte size of all the DIE attribute values. for (const auto &V : values()) - CUOffset += V.SizeOf(AP); + CUOffset += V.sizeOf(FormParams); // Let the children compute their offsets and abbreviation numbers. if (hasChildren()) { @@ -297,7 +297,8 @@ unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP, assert(Abbrev.hasChildren() && "Children flag not set"); for (auto &Child : children()) - CUOffset = Child.computeOffsetsAndAbbrevs(AP, AbbrevSet, CUOffset); + CUOffset = + Child.computeOffsetsAndAbbrevs(FormParams, AbbrevSet, CUOffset); // Each child chain is terminated with a zero byte, adjust the offset. CUOffset += sizeof(int8_t); @@ -335,13 +336,13 @@ void DIEValue::emitValue(const AsmPrinter *AP) const { } } -unsigned DIEValue::SizeOf(const AsmPrinter *AP) const { +unsigned DIEValue::sizeOf(const dwarf::FormParams &FormParams) const { switch (Ty) { case isNone: llvm_unreachable("Expected valid DIEValue"); #define HANDLE_DIEVALUE(T) \ case is##T: \ - return getDIE##T().SizeOf(AP, Form); + return getDIE##T().sizeOf(FormParams, Form); #include "llvm/CodeGen/DIEValue.def" } llvm_unreachable("Unknown DIE kind"); @@ -407,7 +408,8 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const { case dwarf::DW_FORM_strp_sup: case dwarf::DW_FORM_addr: case dwarf::DW_FORM_ref_addr: - Asm->OutStreamer->emitIntValue(Integer, SizeOf(Asm, Form)); + Asm->OutStreamer->emitIntValue(Integer, + sizeOf(Asm->getDwarfFormParams(), Form)); return; case dwarf::DW_FORM_GNU_str_index: case dwarf::DW_FORM_GNU_addr_index: @@ -425,15 +427,12 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const { } } -/// SizeOf - Determine size of integer value in bytes. +/// sizeOf - Determine size of integer value in bytes. 
/// -unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { - assert(AP && "AsmPrinter is required to set FormParams"); - dwarf::FormParams Params = {AP->getDwarfVersion(), - uint8_t(AP->getPointerSize()), - AP->OutStreamer->getContext().getDwarfFormat()}; - - if (Optional<uint8_t> FixedSize = dwarf::getFixedFormByteSize(Form, Params)) +unsigned DIEInteger::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { + if (Optional<uint8_t> FixedSize = + dwarf::getFixedFormByteSize(Form, FormParams)) return *FixedSize; switch (Form) { @@ -464,19 +463,20 @@ void DIEInteger::print(raw_ostream &O) const { /// EmitValue - Emit expression value. /// void DIEExpr::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->emitDebugValue(Expr, SizeOf(AP, Form)); + AP->emitDebugValue(Expr, sizeOf(AP->getDwarfFormParams(), Form)); } /// SizeOf - Determine size of expression value in bytes. /// -unsigned DIEExpr::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEExpr::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_data4: return 4; case dwarf::DW_FORM_data8: return 8; case dwarf::DW_FORM_sec_offset: - return AP->getDwarfOffsetByteSize(); + return FormParams.getDwarfOffsetByteSize(); default: llvm_unreachable("DIE Value form not supported yet"); } @@ -493,12 +493,14 @@ void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; } /// void DIELabel::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { bool IsSectionRelative = Form != dwarf::DW_FORM_addr; - AP->emitLabelReference(Label, SizeOf(AP, Form), IsSectionRelative); + AP->emitLabelReference(Label, sizeOf(AP->getDwarfFormParams(), Form), + IsSectionRelative); } -/// SizeOf - Determine size of label value in bytes. +/// sizeOf - Determine size of label value in bytes. /// -unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIELabel::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_data4: return 4; @@ -506,9 +508,9 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return 8; case dwarf::DW_FORM_sec_offset: case dwarf::DW_FORM_strp: - return AP->getDwarfOffsetByteSize(); + return FormParams.getDwarfOffsetByteSize(); case dwarf::DW_FORM_addr: - return AP->MAI->getCodePointerSize(); + return FormParams.AddrSize; default: llvm_unreachable("DIE Value form not supported yet"); } @@ -527,7 +529,7 @@ void DIEBaseTypeRef::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { AP->emitULEB128(Offset, nullptr, ULEB128PadSize); } -unsigned DIEBaseTypeRef::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEBaseTypeRef::sizeOf(const dwarf::FormParams &, dwarf::Form) const { return ULEB128PadSize; } @@ -541,19 +543,21 @@ void DIEBaseTypeRef::print(raw_ostream &O) const { O << "BaseTypeRef: " << Index /// EmitValue - Emit delta value. /// void DIEDelta::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->emitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form)); + AP->emitLabelDifference(LabelHi, LabelLo, + sizeOf(AP->getDwarfFormParams(), Form)); } /// SizeOf - Determine size of delta value in bytes. 
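The change running through all of DIE.cpp here is mechanical but deliberate: every SizeOf(const AsmPrinter *) becomes sizeOf(const dwarf::FormParams &), so sizing a DIE tree depends only on the DWARF parameters (version, address size, DWARF32/64, and whether strp uses cross-section relocations) instead of a live code generator. A reduced sketch of the two derived sizes these overloads keep consulting:

#include <cstdint>

struct MiniFormParams {
  uint16_t Version;
  uint8_t AddrSize;
  bool IsDwarf64;
  unsigned offsetByteSize() const { return IsDwarf64 ? 8 : 4; }
  unsigned refAddrByteSize() const {
    // DWARF v2 made DW_FORM_ref_addr address-sized; v3+ offset-sized --
    // the special case the old DIEEntry::SizeOf spelled out by hand.
    return Version == 2 ? AddrSize : offsetByteSize();
  }
};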
/// -unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEDelta::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_data4: return 4; case dwarf::DW_FORM_data8: return 8; case dwarf::DW_FORM_sec_offset: - return AP->getDwarfOffsetByteSize(); + return FormParams.getDwarfOffsetByteSize(); default: llvm_unreachable("DIE Value form not supported yet"); } @@ -592,9 +596,10 @@ void DIEString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { } } -/// SizeOf - Determine size of delta value in bytes. +/// sizeOf - Determine size of delta value in bytes. /// -unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEString::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { // Index of string in symbol table. switch (Form) { case dwarf::DW_FORM_GNU_str_index: @@ -603,11 +608,11 @@ unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_strx2: case dwarf::DW_FORM_strx3: case dwarf::DW_FORM_strx4: - return DIEInteger(S.getIndex()).SizeOf(AP, Form); + return DIEInteger(S.getIndex()).sizeOf(FormParams, Form); case dwarf::DW_FORM_strp: - if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) - return DIELabel(S.getSymbol()).SizeOf(AP, Form); - return DIEInteger(S.getOffset()).SizeOf(AP, Form); + if (FormParams.DwarfUsesRelocationsAcrossSections) + return DIELabel(S.getSymbol()).sizeOf(FormParams, Form); + return DIEInteger(S.getOffset()).sizeOf(FormParams, Form); default: llvm_unreachable("Expected valid string form"); } @@ -630,7 +635,7 @@ void DIEInlineString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { llvm_unreachable("Expected valid string form"); } -unsigned DIEInlineString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEInlineString::sizeOf(const dwarf::FormParams &, dwarf::Form) const { // Emit string bytes + NULL byte. 
return S.size() + 1; } @@ -653,7 +658,8 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_ref2: case dwarf::DW_FORM_ref4: case dwarf::DW_FORM_ref8: - AP->OutStreamer->emitIntValue(Entry->getOffset(), SizeOf(AP, Form)); + AP->OutStreamer->emitIntValue(Entry->getOffset(), + sizeOf(AP->getDwarfFormParams(), Form)); return; case dwarf::DW_FORM_ref_udata: @@ -665,11 +671,12 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { uint64_t Addr = Entry->getDebugSectionOffset(); if (const MCSymbol *SectionSym = Entry->getUnit()->getCrossSectionRelativeBaseAddress()) { - AP->emitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); + AP->emitLabelPlusOffset(SectionSym, Addr, + sizeOf(AP->getDwarfFormParams(), Form), true); return; } - AP->OutStreamer->emitIntValue(Addr, SizeOf(AP, Form)); + AP->OutStreamer->emitIntValue(Addr, sizeOf(AP->getDwarfFormParams(), Form)); return; } default: @@ -677,7 +684,8 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { } } -unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEEntry::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_ref1: return 1; @@ -690,15 +698,7 @@ unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_ref_udata: return getULEB128Size(Entry->getOffset()); case dwarf::DW_FORM_ref_addr: - if (AP->getDwarfVersion() == 2) - return AP->MAI->getCodePointerSize(); - switch (AP->OutStreamer->getContext().getDwarfFormat()) { - case dwarf::DWARF32: - return 4; - case dwarf::DWARF64: - return 8; - } - llvm_unreachable("Invalid DWARF format"); + return FormParams.getRefAddrByteSize(); default: llvm_unreachable("Improper form for DIE reference"); @@ -714,12 +714,10 @@ void DIEEntry::print(raw_ostream &O) const { // DIELoc Implementation //===----------------------------------------------------------------------===// -/// ComputeSize - calculate the size of the location expression. -/// -unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const { +unsigned DIELoc::computeSize(const dwarf::FormParams &FormParams) const { if (!Size) { for (const auto &V : values()) - Size += V.SizeOf(AP); + Size += V.sizeOf(FormParams); } return Size; @@ -743,9 +741,9 @@ void DIELoc::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const { V.emitValue(Asm); } -/// SizeOf - Determine size of location data in bytes. +/// sizeOf - Determine size of location data in bytes. /// -unsigned DIELoc::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIELoc::sizeOf(const dwarf::FormParams &, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_block1: return Size + sizeof(int8_t); case dwarf::DW_FORM_block2: return Size + sizeof(int16_t); @@ -766,12 +764,10 @@ void DIELoc::print(raw_ostream &O) const { // DIEBlock Implementation //===----------------------------------------------------------------------===// -/// ComputeSize - calculate the size of the block. -/// -unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const { +unsigned DIEBlock::computeSize(const dwarf::FormParams &FormParams) const { if (!Size) { for (const auto &V : values()) - Size += V.SizeOf(AP); + Size += V.sizeOf(FormParams); } return Size; @@ -797,9 +793,9 @@ void DIEBlock::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const { V.emitValue(Asm); } -/// SizeOf - Determine size of block data in bytes. +/// sizeOf - Determine size of block data in bytes. 
/// -unsigned DIEBlock::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIEBlock::sizeOf(const dwarf::FormParams &, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_block1: return Size + sizeof(int8_t); case dwarf::DW_FORM_block2: return Size + sizeof(int16_t); @@ -820,22 +816,23 @@ void DIEBlock::print(raw_ostream &O) const { // DIELocList Implementation //===----------------------------------------------------------------------===// -unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { +unsigned DIELocList::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_loclistx: return getULEB128Size(Index); case dwarf::DW_FORM_data4: - assert(!AP->isDwarf64() && + assert(FormParams.Format != dwarf::DWARF64 && "DW_FORM_data4 is not suitable to emit a pointer to a location list " "in the 64-bit DWARF format"); return 4; case dwarf::DW_FORM_data8: - assert(AP->isDwarf64() && + assert(FormParams.Format == dwarf::DWARF64 && "DW_FORM_data8 is not suitable to emit a pointer to a location list " "in the 32-bit DWARF format"); return 8; case dwarf::DW_FORM_sec_offset: - return AP->getDwarfOffsetByteSize(); + return FormParams.getDwarfOffsetByteSize(); default: llvm_unreachable("DIE Value form not supported yet"); } @@ -860,9 +857,10 @@ void DIELocList::print(raw_ostream &O) const { O << "LocList: " << Index; } // DIEAddrOffset Implementation //===----------------------------------------------------------------------===// -unsigned DIEAddrOffset::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { - return Addr.SizeOf(AP, dwarf::DW_FORM_addrx) + - Offset.SizeOf(AP, dwarf::DW_FORM_data4); +unsigned DIEAddrOffset::sizeOf(const dwarf::FormParams &FormParams, + dwarf::Form) const { + return Addr.sizeOf(FormParams, dwarf::DW_FORM_addrx) + + Offset.sizeOf(FormParams, dwarf::DW_FORM_data4); } /// EmitValue - Emit label value. diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp index 5f4ee747fcca..e175854f7b93 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -207,6 +207,18 @@ void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag, computeHash(Entry); } +void DIEHash::hashRawTypeReference(const DIE &Entry) { + unsigned &DieNumber = Numbering[&Entry]; + if (DieNumber) { + addULEB128('R'); + addULEB128(DieNumber); + return; + } + DieNumber = Numbering.size(); + addULEB128('T'); + computeHash(Entry); +} + // Hash all of the values in a block like set of values. This assumes that // all of the data is going to be added as integers. 
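hashRawTypeReference above gives DIE-to-DIE references a stable spelling in the type hash: the first visit numbers the node and hashes it in full behind a 'T' marker, and every later visit emits just 'R' plus that ordinal, so reference cycles terminate and raw section offsets never enter the hash. The same scheme reduced to standard C++ (the real code feeds an MD5 stream and ULEB-encodes the ordinal):

#include <cstdint>
#include <map>
#include <vector>

struct Node { std::vector<const Node *> Refs; }; // stand-in for DIE

struct RefHasher {
  std::vector<uint8_t> Bytes;                 // stand-in for MD5 updates
  std::map<const Node *, unsigned> Numbering; // 1-based visit order
  void add(uint8_t B) { Bytes.push_back(B); }
  void hashRef(const Node &N) {
    unsigned &Num = Numbering[&N];
    if (Num) { add('R'); add(uint8_t(Num)); return; }
    Num = unsigned(Numbering.size()); // entry already inserted above
    add('T');
    for (const Node *R : N.Refs) hashRef(*R);
  }
};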
void DIEHash::hashBlockData(const DIE::const_value_range &Values) { @@ -298,10 +310,10 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) { addULEB128(Attribute); addULEB128(dwarf::DW_FORM_block); if (Value.getType() == DIEValue::isBlock) { - addULEB128(Value.getDIEBlock().ComputeSize(AP)); + addULEB128(Value.getDIEBlock().computeSize(AP->getDwarfFormParams())); hashBlockData(Value.getDIEBlock().values()); } else if (Value.getType() == DIEValue::isLoc) { - addULEB128(Value.getDIELoc().ComputeSize(AP)); + addULEB128(Value.getDIELoc().computeSize(AP->getDwarfFormParams())); hashBlockData(Value.getDIELoc().values()); } else { // We could add the block length, but that would take diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/llvm/lib/CodeGen/AsmPrinter/DIEHash.h index 29e1da4c5d60..24a973b39271 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.h @@ -62,6 +62,8 @@ public: /// Encodes and adds \param Value to the hash as a SLEB128. void addSLEB128(int64_t Value); + void hashRawTypeReference(const DIE &Entry); + private: /// Adds \param Str to the hash and includes a NULL byte. void addString(StringRef Str); diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 4df34d2c9402..18fc46c74eb4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -155,7 +155,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DIType *Ty) { if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && - Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_atomic_type) + Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_atomic_type && + Tag != dwarf::DW_TAG_immutable_type) return DDTy->getSizeInBits(); DIType *BaseType = DDTy->getBaseType(); @@ -210,7 +211,8 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) { return true; assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type || T == dwarf::DW_TAG_volatile_type || - T == dwarf::DW_TAG_restrict_type || T == dwarf::DW_TAG_atomic_type); + T == dwarf::DW_TAG_restrict_type || T == dwarf::DW_TAG_atomic_type || + T == dwarf::DW_TAG_immutable_type); assert(DTy->getBaseType() && "Expected valid base type"); return isUnsignedDIType(DTy->getBaseType()); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 9b73f0ab2f05..5913c687db48 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -127,9 +127,14 @@ unsigned DwarfCompileUnit::getOrCreateSourceID(const DIFile *File) { if (!File) return Asm->OutStreamer->emitDwarfFileDirective(0, "", "", None, None, CUID); - return Asm->OutStreamer->emitDwarfFileDirective( - 0, File->getDirectory(), File->getFilename(), DD->getMD5AsBytes(File), - File->getSource(), CUID); + + if (LastFile != File) { + LastFile = File; + LastFileID = Asm->OutStreamer->emitDwarfFileDirective( + 0, File->getDirectory(), File->getFilename(), DD->getMD5AsBytes(File), + File->getSource(), CUID); + } + return LastFileID; } DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( @@ -260,9 +265,20 @@ void DwarfCompileUnit::addLocationAttribute( if (Global) { const MCSymbol *Sym = Asm->getSymbol(Global); - unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support 
for other sizes if necessary"); + // 16-bit platforms like MSP430 and AVR take this path, so sink this + // assert to platforms that use it. + auto GetPointerSizedFormAndOp = [this]() { + unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + struct FormAndOp { + dwarf::Form Form; + dwarf::LocationAtom Op; + }; + return PointerSize == 4 + ? FormAndOp{dwarf::DW_FORM_data4, dwarf::DW_OP_const4u} + : FormAndOp{dwarf::DW_FORM_data8, dwarf::DW_OP_const8u}; + }; if (Global->isThreadLocal()) { if (Asm->TM.useEmulatedTLS()) { // TODO: add debug info for emulated thread local mode. @@ -270,15 +286,12 @@ void DwarfCompileUnit::addLocationAttribute( // FIXME: Make this work with -gsplit-dwarf. // Based on GCC's support for TLS: if (!DD->useSplitDwarf()) { + auto FormAndOp = GetPointerSizedFormAndOp(); // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u - : dwarf::DW_OP_const8u); + addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op); // 2) containing the (relocated) offset of the TLS variable // within the module's TLS block. - addExpr(*Loc, - PointerSize == 4 ? dwarf::DW_FORM_data4 - : dwarf::DW_FORM_data8, + addExpr(*Loc, FormAndOp.Form, Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); } else { addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); @@ -292,13 +305,11 @@ void DwarfCompileUnit::addLocationAttribute( } } else if (Asm->TM.getRelocationModel() == Reloc::RWPI || Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) { + auto FormAndOp = GetPointerSizedFormAndOp(); // Constant - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u - : dwarf::DW_OP_const8u); + addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op); // Relocation offset - addExpr(*Loc, PointerSize == 4 ? dwarf::DW_FORM_data4 - : dwarf::DW_FORM_data8, + addExpr(*Loc, FormAndOp.Form, Asm->getObjFileLowering().getIndirectSymViaRWPI(Sym)); // Base register Register BaseReg = Asm->getObjFileLowering().getStaticBase(); @@ -1575,7 +1586,8 @@ void DwarfCompileUnit::createBaseTypeDIEs() { Twine(dwarf::AttributeEncodingString(Btr.Encoding) + "_" + Twine(Btr.BitSize)).toStringRef(Str)); addUInt(Die, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, Btr.Encoding); - addUInt(Die, dwarf::DW_AT_byte_size, None, Btr.BitSize / 8); + // Round up to smallest number of bytes that contains this number of bits. + addUInt(Die, dwarf::DW_AT_byte_size, None, divideCeil(Btr.BitSize, 8)); Btr.Die = &Die; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index fb03982b5e4a..f2e1f6346803 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -86,6 +86,9 @@ class DwarfCompileUnit final : public DwarfUnit { /// DWO ID for correlating skeleton and split units. uint64_t DWOId = 0; + const DIFile *LastFile = nullptr; + unsigned LastFileID; + /// Construct a DIE for the given DbgVariable without initializing the /// DbgVariable's DIE reference. 
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 48134f1fd774..680b9586228f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2539,12 +2539,10 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, if (Op.getDescription().Op[I] == Encoding::SizeNA) continue; if (Op.getDescription().Op[I] == Encoding::BaseTypeRef) { - uint64_t Offset = - CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die->getOffset(); - assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit"); - Streamer.emitULEB128(Offset, "", ULEB128PadSize); + unsigned Length = + Streamer.emitDIERef(*CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die); // Make sure comments stay aligned. - for (unsigned J = 0; J < ULEB128PadSize; ++J) + for (unsigned J = 0; J < Length; ++J) if (Comment != End) Comment++; } else { @@ -3369,7 +3367,8 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, // Fast path if we're building some type units and one has already used the // address pool we know we're going to throw away all this work anyway, so // don't bother building dependent types. - if (!TypeUnitsUnderConstruction.empty() && AddrPool.hasBeenUsed()) + if (!TypeUnitsUnderConstruction.empty() && + (AddrPool.hasBeenUsed() || SeenLocalType)) return; auto Ins = TypeSignatures.insert(std::make_pair(CTy, 0)); @@ -3380,6 +3379,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, bool TopLevelType = TypeUnitsUnderConstruction.empty(); AddrPool.resetUsedFlag(); + SeenLocalType = false; auto OwnedUnit = std::make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder, getDwoLineTable(CU)); @@ -3423,7 +3423,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, // Types referencing entries in the address table cannot be placed in type // units. - if (AddrPool.hasBeenUsed()) { + if (AddrPool.hasBeenUsed() || SeenLocalType) { // Remove all the types built while building this type. // This is pessimistic as some of these types might not be dependent on @@ -3451,14 +3451,18 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, DwarfDebug::NonTypeUnitContext::NonTypeUnitContext(DwarfDebug *DD) : DD(DD), - TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)), AddrPoolUsed(DD->AddrPool.hasBeenUsed()) { + TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)), + AddrPoolUsed(DD->AddrPool.hasBeenUsed()), + SeenLocalType(DD->SeenLocalType) { DD->TypeUnitsUnderConstruction.clear(); DD->AddrPool.resetUsedFlag(); + DD->SeenLocalType = false; } DwarfDebug::NonTypeUnitContext::~NonTypeUnitContext() { DD->TypeUnitsUnderConstruction = std::move(TypeUnitsUnderConstruction); DD->AddrPool.resetUsedFlag(AddrPoolUsed); + DD->SeenLocalType = SeenLocalType; } DwarfDebug::NonTypeUnitContext DwarfDebug::enterNonTypeUnitContext() { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 4e1a1b1e068d..0043000652e8 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -433,6 +433,7 @@ private: DenseMap<const DIStringType *, unsigned> StringTypeLocMap; AddressPool AddrPool; + bool SeenLocalType = false; /// Accelerator tables. 
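Type units are built speculatively here: addDwarfTypeUnitType now throws the work away not only when the type touched the address pool but also when it pulled in a CU-local type (SeenLocalType), and NonTypeUnitContext saves and restores the new flag with the rest of the speculative state. The guard shape, reduced to a generic sketch; for single values LLVM's own llvm::SaveAndRestore (llvm/Support/SaveAndRestore.h) is the ready-made version:

template <typename T> class SaveAndClearGuard {
  T &Slot;
  T Saved;
public:
  explicit SaveAndClearGuard(T &S) : Slot(S), Saved(S) { Slot = T(); }
  ~SaveAndClearGuard() { Slot = Saved; } // restore on scope exit
};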
AccelTable<DWARF5AccelTableData> AccelDebugNames; @@ -671,6 +672,7 @@ public: DwarfDebug *DD; decltype(DwarfDebug::TypeUnitsUnderConstruction) TypeUnitsUnderConstruction; bool AddrPoolUsed; + bool SeenLocalType; friend class DwarfDebug; NonTypeUnitContext(DwarfDebug *DD); public: @@ -679,6 +681,7 @@ public: }; NonTypeUnitContext enterNonTypeUnitContext(); + void seenLocalType() { SeenLocalType = true; } /// Add a label so that arange data can be generated for it. void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/llvm/lib/CodeGen/AsmPrinter/DwarfException.h index 40898c9fc855..4defa8a30855 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfException.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfException.h @@ -98,6 +98,8 @@ class LLVM_LIBRARY_VISIBILITY AIXException : public DwarfCFIExceptionBase { public: AIXException(AsmPrinter *A); + void markFunctionEnd() override; + void endModule() override {} void beginFunction(const MachineFunction *MF) override {} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 37407c98e75f..ee932d105107 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -681,9 +681,25 @@ void DwarfExpression::emitLegacySExt(unsigned FromBits) { } void DwarfExpression::emitLegacyZExt(unsigned FromBits) { - // (X & (1 << FromBits - 1)) - emitOp(dwarf::DW_OP_constu); - emitUnsigned((1ULL << FromBits) - 1); + // Heuristic to decide the most efficient encoding. + // A ULEB can encode 7 1-bits per byte. + if (FromBits / 7 < 1+1+1+1+1) { + // (X & (1 << FromBits - 1)) + emitOp(dwarf::DW_OP_constu); + emitUnsigned((1ULL << FromBits) - 1); + } else { + // Note that the DWARF 4 stack consists of pointer-sized elements, + // so technically it doesn't make sense to shift left more than 64 + // bits. We leave that for the consumer to decide though. LLDB for + // example uses APInt for the stack elements and can still deal + // with this. + emitOp(dwarf::DW_OP_lit1); + emitOp(dwarf::DW_OP_constu); + emitUnsigned(FromBits); + emitOp(dwarf::DW_OP_shl); + emitOp(dwarf::DW_OP_lit1); + emitOp(dwarf::DW_OP_minus); + } emitOp(dwarf::DW_OP_and); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp index 838e1c9a10be..a67d0f032cf6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -92,7 +92,8 @@ unsigned DwarfFile::computeSizeAndOffsetsForUnit(DwarfUnit *TheU) { // Compute the size and offset of a DIE. The offset is relative to start of the // CU. It returns the offset after laying out the DIE. 
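The emitLegacyZExt heuristic above is plain byte counting. The mask form costs one DW_OP_constu opcode plus a ULEB operand that grows a byte per 7 mask bits; the shift form (DW_OP_lit1, DW_OP_constu FromBits, DW_OP_shl, DW_OP_lit1, DW_OP_minus) is a flat six bytes for FromBits < 128; DW_OP_and is common to both. So the constant mask is kept only while FromBits / 7 < 5, i.e. while its ULEB stays within roughly five bytes:

unsigned maskFormSize(unsigned FromBits) {
  return 1 /*DW_OP_constu*/ + (FromBits + 6) / 7 /*ULEB of (1 << F) - 1*/;
}
unsigned shiftFormSize(unsigned /*FromBits*/) {
  return 6; // lit1 + constu + 1-byte ULEB + shl + lit1 + minus
}
// maskFormSize(34) == 6 == shiftFormSize(34); beyond that the mask loses,
// matching the FromBits / 7 < 5 guard above.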
unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) { - return Die.computeOffsetsAndAbbrevs(Asm, Abbrevs, Offset); + return Die.computeOffsetsAndAbbrevs(Asm->getDwarfFormParams(), Abbrevs, + Offset); } void DwarfFile::emitAbbrevs(MCSection *Section) { Abbrevs.Emit(Asm, Section); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 6b6d63f14f87..15d90c54adfc 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -77,7 +77,7 @@ void DIEDwarfExpression::enableTemporaryBuffer() { void DIEDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; } unsigned DIEDwarfExpression::getTemporaryBufferSize() { - return TmpDIE.ComputeSize(&AP); + return TmpDIE.computeSize(AP.getDwarfFormParams()); } void DIEDwarfExpression::commitTemporaryBuffer() { OutDIE.takeValues(TmpDIE); } @@ -394,14 +394,14 @@ DIE &DwarfUnit::createAndAddDIE(dwarf::Tag Tag, DIE &Parent, const DINode *N) { } void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc) { - Loc->ComputeSize(Asm); + Loc->computeSize(Asm->getDwarfFormParams()); DIELocs.push_back(Loc); // Memoize so we can call the destructor later on. addAttribute(Die, Attribute, Loc->BestForm(DD->getDwarfVersion()), Loc); } void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, DIEBlock *Block) { - Block->ComputeSize(Asm); + Block->computeSize(Asm->getDwarfFormParams()); DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on. addAttribute(Die, Attribute, Form, Block); } @@ -597,10 +597,8 @@ DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE, // Skip updating the accelerator tables since this is not the full type. if (MDString *TypeId = CTy->getRawIdentifier()) DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy); - else { - auto X = DD->enterNonTypeUnitContext(); + else finishNonUnitTypeDIE(TyDIE, CTy); - } return &TyDIE; } constructTypeDIE(TyDIE, CTy); @@ -744,6 +742,16 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) { addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size); } + if (DIExpression *Expr = STy->getStringLocationExp()) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + // This is to describe the memory location of the + // string, so lock it down as such. + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Expr); + addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + } + if (STy->getEncoding()) { // For eventual Unicode support. 
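DW_AT_data_location, attached above for DIStringType, tells a debugger how to get from a string descriptor to the character payload, and setMemoryLocationKind() pins the expression down as a location rather than a computed value. On the producer side the metadata would commonly carry a dereference of the descriptor; a hypothetical frontend snippet (DIB is an assumed DIBuilder, and the exact expression is the frontend's choice):

DIExpression *DataLoc = DIB.createExpression(ArrayRef<uint64_t>{
    dwarf::DW_OP_push_object_address, dwarf::DW_OP_deref});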
addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, @@ -1189,7 +1197,7 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, DefinitionArgs = SP->getType()->getTypeArray(); if (DeclArgs.size() && DefinitionArgs.size()) - if (DefinitionArgs[0] != NULL && DeclArgs[0] != DefinitionArgs[0]) + if (DefinitionArgs[0] != nullptr && DeclArgs[0] != DefinitionArgs[0]) addType(SPDie, DefinitionArgs[0]); DeclDie = getDIE(SPDecl); @@ -1842,5 +1850,25 @@ void DwarfTypeUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) { StringRef Name = CTy->getName(); if (!Name.empty()) addString(D, dwarf::DW_AT_name, Name); + if (Name.startswith("_STN") || !Name.contains('<')) + addTemplateParams(D, CTy->getTemplateParams()); + // If the type is in an anonymous namespace, we can't reference it from a TU + // (since the type would be CU local and the TU doesn't specify which TU has + // the appropriate type definition) - so flag this emission as such and skip + // the rest of the emission now since we're going to throw out all this work + // and put the outer/referencing type in the CU instead. + // FIXME: Probably good to generalize this to a DICompositeType flag populated + // by the frontend, then we could use that to have types that can have + // decl+def merged by LTO but where the definition still doesn't go in a type + // unit because the type has only one definition. + for (DIScope *S = CTy->getScope(); S; S = S->getScope()) { + if (auto *NS = dyn_cast<DINamespace>(S)) { + if (NS->getName().empty()) { + DD->seenLocalType(); + break; + } + } + } + auto X = DD->enterNonTypeUnitContext(); getCU().createTypeDIE(CTy); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 54b0079dd7ce..330f3bacca43 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -25,9 +25,7 @@ namespace llvm { class ConstantFP; class ConstantInt; -class DbgVariable; class DwarfCompileUnit; -class MachineOperand; class MCDwarfDwoLineTable; class MCSymbol; diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h index 7d5e51218693..a92a89084cad 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h @@ -19,8 +19,6 @@ namespace llvm { class AsmPrinter; -class MCStreamer; -class Module; class DILocation; class PseudoProbeHandler : public AsmPrinterHandler { diff --git a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp index 1e3f33e70715..ad8432343a60 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp @@ -27,7 +27,7 @@ using namespace llvm; -WinCFGuard::WinCFGuard(AsmPrinter *A) : AsmPrinterHandler(), Asm(A) {} +WinCFGuard::WinCFGuard(AsmPrinter *A) : Asm(A) {} WinCFGuard::~WinCFGuard() {} diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h index 2a4ea92a92aa..95d5dcfbbd0f 100644 --- a/llvm/lib/CodeGen/BranchFolding.h +++ b/llvm/lib/CodeGen/BranchFolding.h @@ -23,7 +23,6 @@ class BasicBlock; class MachineBranchProbabilityInfo; class MachineFunction; class MachineLoopInfo; -class MachineModuleInfo; class MachineRegisterInfo; class MBFIWrapper; class ProfileSummaryInfo; diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp index 1c2e3f998449..de173a9dfd62 100644 --- a/llvm/lib/CodeGen/CFIInstrInserter.cpp +++ 
b/llvm/lib/CodeGen/CFIInstrInserter.cpp @@ -347,7 +347,7 @@ bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) { } if (ForceFullCFA) { - MF.getSubtarget().getFrameLowering()->emitCalleeSavedFrameMoves( + MF.getSubtarget().getFrameLowering()->emitCalleeSavedFrameMovesFullCFA( *MBBInfo.MBB, MBBI); InsertedCFIInstr = true; PrevMBBInfo = &MBBInfo; diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 5f9982cd155d..84a0e4142bb6 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -43,9 +43,9 @@ void VirtRegAuxInfo::calculateSpillWeightsAndHints() { } // Return the preferred allocation register for reg, given a COPY instruction. -static Register copyHint(const MachineInstr *MI, unsigned Reg, - const TargetRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { +Register VirtRegAuxInfo::copyHint(const MachineInstr *MI, unsigned Reg, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { unsigned Sub, HSub; Register HReg; if (MI->getOperand(0).getReg() == Reg) { @@ -77,9 +77,10 @@ static Register copyHint(const MachineInstr *MI, unsigned Reg, } // Check if all values in LI are rematerializable -static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, - const VirtRegMap &VRM, - const TargetInstrInfo &TII) { +bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, + const LiveIntervals &LIS, + const VirtRegMap &VRM, + const TargetInstrInfo &TII) { Register Reg = LI.reg(); Register Original = VRM.getOriginal(Reg); for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 747f4e4fdecc..28f24e5ea908 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -4168,11 +4168,11 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // We can get through binary operator, if it is legal. In other words, the // binary operator must have a nuw or nsw flag. - const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); - if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) && - ((!IsSExt && BinOp->hasNoUnsignedWrap()) || - (IsSExt && BinOp->hasNoSignedWrap()))) - return true; + if (const auto *BinOp = dyn_cast<BinaryOperator>(Inst)) + if (isa<OverflowingBinaryOperator>(BinOp) && + ((!IsSExt && BinOp->hasNoUnsignedWrap()) || + (IsSExt && BinOp->hasNoSignedWrap()))) + return true; // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst)) if ((Inst->getOpcode() == Instruction::And || @@ -4181,10 +4181,10 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst)) if (Inst->getOpcode() == Instruction::Xor) { - const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1)); // Make sure it is not a NOT. 
- if (Cst && !Cst->getValue().isAllOnes()) - return true; + if (const auto *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1))) + if (!Cst->getValue().isAllOnes()) + return true; } // zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst)) diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 3bed81d5841d..1d50e1d22b95 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -90,7 +90,6 @@ CGOPT(bool, EnableAddrsig) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableDebugEntryValues) -CGOPT_EXP(bool, ValueTrackingVariableLocations) CGOPT(bool, ForceDwarfFrameSection) CGOPT(bool, XRayOmitFunctionIndex) CGOPT(bool, DebugStrictDwarf) @@ -433,12 +432,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableDebugEntryValues); - static cl::opt<bool> ValueTrackingVariableLocations( - "experimental-debug-variable-locations", - cl::desc("Use experimental new value-tracking variable locations"), - cl::init(false)); - CGBINDOPT(ValueTrackingVariableLocations); - static cl::opt<bool> EnableMachineFunctionSplitter( "split-machine-functions", cl::desc("Split out cold basic blocks from machine functions based on " @@ -539,12 +532,6 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.DebugStrictDwarf = getDebugStrictDwarf(); Options.LoopAlignment = getAlignLoops(); - if (auto Opt = getExplicitValueTrackingVariableLocations()) - Options.ValueTrackingVariableLocations = *Opt; - else - Options.ValueTrackingVariableLocations = - getDefaultValueTrackingVariableLocations(TheTriple); - Options.MCOptions = mc::InitMCTargetOptionsFromFlags(); Options.ThreadModel = getThreadModel(); @@ -620,7 +607,7 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, Function &F) { auto &Ctx = F.getContext(); AttributeList Attrs = F.getAttributes(); - AttrBuilder NewAttrs; + AttrBuilder NewAttrs(Ctx); if (!CPU.empty() && !F.hasFnAttribute("target-cpu")) NewAttrs.addAttribute("target-cpu", CPU); @@ -698,8 +685,3 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, setFunctionAttributes(CPU, Features, F); } -bool codegen::getDefaultValueTrackingVariableLocations(const llvm::Triple &T) { - if (T.getArch() == llvm::Triple::x86_64) - return true; - return false; -} diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 901409ea9f8f..eb2d449bc4af 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -40,8 +40,7 @@ using namespace llvm; CriticalAntiDepBreaker::CriticalAntiDepBreaker(MachineFunction &MFi, const RegisterClassInfo &RCI) - : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), - TII(MF.getSubtarget().getInstrInfo()), + : MF(MFi), MRI(MF.getRegInfo()), TII(MF.getSubtarget().getInstrInfo()), TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI), Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0), DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {} diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp index 7300ea6b50ee..d9caa8ad42d0 100644 --- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -68,9 +68,16 @@ void ExpandPostRA::TransferImplicitOperands(MachineInstr *MI) { MachineBasicBlock::iterator CopyMI = MI; --CopyMI; - for (const MachineOperand &MO : MI->implicit_operands()) - if 
(MO.isReg()) - CopyMI->addOperand(MO); + Register DstReg = MI->getOperand(0).getReg(); + for (const MachineOperand &MO : MI->implicit_operands()) { + CopyMI->addOperand(MO); + + // Be conservative about preserving kills when subregister defs are + // involved. If there was implicit kill of a super-register overlapping the + // copy result, we would kill the subregisters previous copies defined. + if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg())) + CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false); + } } bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index 2676becdd807..1a642e233a6a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -191,10 +191,10 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc, assert(DstOps.size() == 1 && "Invalid dsts"); if (SrcOps[0].getLLTTy(*getMRI()).isVector()) { // Try to constant fold vector constants. - auto VecCst = ConstantFoldVectorBinop( + Register VecCst = ConstantFoldVectorBinop( Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this); if (VecCst) - return MachineInstrBuilder(getMF(), *VecCst); + return buildCopy(DstOps[0], VecCst); break; } if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(), diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index d061664e8c5d..1ec7868f2234 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -86,6 +86,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, CallLoweringInfo Info; const DataLayout &DL = MIRBuilder.getDataLayout(); MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); bool CanBeTailCalled = CB.isTailCall() && isInTailCallPosition(CB, MF.getTarget()) && (MF.getFunction() @@ -109,6 +110,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, CanBeTailCalled = false; } + // First step is to marshall all the function's parameters into the correct // physregs and memory locations. Gather the sequence of argument types that // we'll pass to the assigner function. 
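The CallLowering.cpp hunk that follows threads the call site's return alignment into GlobalISel: when CB.getRetAlign() is stronger than 1, the call is made to define a cloned result register, and the original result is then re-established through a G_ASSERT_ALIGN hint. A condensed sketch of the pattern, using only names that appear in the diff itself (the surrounding argument-marshalling code is elided):

  // Sketch only: condensed from the hunk below, not a drop-in replacement.
  Register ReturnHintAlignReg;
  Align ReturnHintAlign;
  if (MaybeAlign Alignment = CB.getRetAlign()) {
    if (*Alignment > Align(1)) {
      ReturnHintAlignReg = MRI.cloneVirtualRegister(ResRegs[0]);
      Info.OrigRet.Regs[0] = ReturnHintAlignReg; // the call defines the clone
      ReturnHintAlign = *Alignment;
    }
  }
  if (!lowerCall(MIRBuilder, Info))
    return false;
  // Tail calls leave no insertion point after the call, so skip the hint.
  if (ReturnHintAlignReg && !Info.IsTailCall)
    MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg, ReturnHintAlign);

GISelKnownBits then folds the G_ASSERT_ALIGN immediate into known low zero bits; see the GISelKnownBits.cpp hunk later in this import.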
@@ -136,10 +138,23 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, else Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false); + Register ReturnHintAlignReg; + Align ReturnHintAlign; + Info.OrigRet = ArgInfo{ResRegs, RetTy, 0, ISD::ArgFlagsTy{}}; - if (!Info.OrigRet.Ty->isVoidTy()) + + if (!Info.OrigRet.Ty->isVoidTy()) { setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB); + if (MaybeAlign Alignment = CB.getRetAlign()) { + if (*Alignment > Align(1)) { + ReturnHintAlignReg = MRI.cloneVirtualRegister(ResRegs[0]); + Info.OrigRet.Regs[0] = ReturnHintAlignReg; + ReturnHintAlign = *Alignment; + } + } + } + Info.CB = &CB; Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees); Info.CallConv = CallConv; @@ -147,7 +162,15 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, Info.IsMustTailCall = CB.isMustTailCall(); Info.IsTailCall = CanBeTailCalled; Info.IsVarArg = IsVarArg; - return lowerCall(MIRBuilder, Info); + if (!lowerCall(MIRBuilder, Info)) + return false; + + if (ReturnHintAlignReg && !Info.IsTailCall) { + MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg, + ReturnHintAlign); + } + + return true; } template <typename FuncInfoTy> @@ -509,7 +532,8 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, bool CallLowering::determineAndHandleAssignments( ValueHandler &Handler, ValueAssigner &Assigner, SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, bool IsVarArg, Register ThisReturnReg) const { + CallingConv::ID CallConv, bool IsVarArg, + ArrayRef<Register> ThisReturnRegs) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); SmallVector<CCValAssign, 16> ArgLocs; @@ -519,7 +543,7 @@ bool CallLowering::determineAndHandleAssignments( return false; return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder, - ThisReturnReg); + ThisReturnRegs); } static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) { @@ -596,7 +620,7 @@ bool CallLowering::handleAssignments(ValueHandler &Handler, CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineIRBuilder &MIRBuilder, - Register ThisReturnReg) const { + ArrayRef<Register> ThisReturnRegs) const { MachineFunction &MF = MIRBuilder.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); @@ -740,10 +764,10 @@ bool CallLowering::handleAssignments(ValueHandler &Handler, assert(!VA.needsCustom() && "custom loc should have been handled already"); - if (i == 0 && ThisReturnReg.isValid() && + if (i == 0 && !ThisReturnRegs.empty() && Handler.isIncomingArgumentHandler() && isTypeIsValidForThisReturn(ValVT)) { - Handler.assignValueToReg(Args[i].Regs[i], ThisReturnReg, VA); + Handler.assignValueToReg(ArgReg, ThisReturnRegs[Part], VA); continue; } diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp index dd1ef74e8ad0..30f8838805b5 100644 --- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp @@ -56,8 +56,7 @@ class WorkListMaintainer : public GISelChangeObserver { SmallPtrSet<const MachineInstr *, 4> CreatedInstrs; public: - WorkListMaintainer(WorkListTy &WorkList) - : GISelChangeObserver(), WorkList(WorkList) {} + WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {} virtual ~WorkListMaintainer() { } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 
f7a634dad61a..d6a009744161 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1748,6 +1748,20 @@ void CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI, MI.eraseFromParent(); } +bool CombinerHelper::matchCombineUnmergeUndef( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + unsigned SrcIdx = MI.getNumOperands() - 1; + Register SrcReg = MI.getOperand(SrcIdx).getReg(); + MatchInfo = [&MI](MachineIRBuilder &B) { + unsigned NumElems = MI.getNumOperands() - 1; + for (unsigned Idx = 0; Idx < NumElems; ++Idx) { + Register DstReg = MI.getOperand(Idx).getReg(); + B.buildUndef(DstReg); + } + }; + return isa<GImplicitDef>(MRI.getVRegDef(SrcReg)); +} + bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && "Expected an unmerge"); @@ -2025,16 +2039,19 @@ void CombinerHelper::applyCombineAddP2IToPtrAdd( } bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI, - int64_t &NewCst) { + APInt &NewCst) { auto &PtrAdd = cast<GPtrAdd>(MI); Register LHS = PtrAdd.getBaseReg(); Register RHS = PtrAdd.getOffsetReg(); MachineRegisterInfo &MRI = Builder.getMF().getRegInfo(); - if (auto RHSCst = getIConstantVRegSExtVal(RHS, MRI)) { - int64_t Cst; + if (auto RHSCst = getIConstantVRegVal(RHS, MRI)) { + APInt Cst; if (mi_match(LHS, MRI, m_GIntToPtr(m_ICst(Cst)))) { - NewCst = Cst + *RHSCst; + auto DstTy = MRI.getType(PtrAdd.getReg(0)); + // G_INTTOPTR uses zero-extension + NewCst = Cst.zextOrTrunc(DstTy.getSizeInBits()); + NewCst += RHSCst->sextOrTrunc(DstTy.getSizeInBits()); return true; } } @@ -2043,7 +2060,7 @@ bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI, } void CombinerHelper::applyCombineConstPtrAddToI2P(MachineInstr &MI, - int64_t &NewCst) { + APInt &NewCst) { auto &PtrAdd = cast<GPtrAdd>(MI); Register Dst = PtrAdd.getReg(0); @@ -3875,39 +3892,48 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, LLT Ty = MRI.getType(Dst); unsigned BitWidth = Ty.getScalarSizeInBits(); - Register ShlSrc, ShlAmt, LShrSrc, LShrAmt; + Register ShlSrc, ShlAmt, LShrSrc, LShrAmt, Amt; unsigned FshOpc = 0; - // Match (or (shl x, amt), (lshr y, sub(bw, amt))). - if (mi_match( - Dst, MRI, - // m_GOr() handles the commuted version as well. - m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)), - m_GLShr(m_Reg(LShrSrc), m_GSub(m_SpecificICstOrSplat(BitWidth), - m_Reg(LShrAmt)))))) { + // Match (or (shl ...), (lshr ...)). + if (!mi_match(Dst, MRI, + // m_GOr() handles the commuted version as well. + m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)), + m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt))))) + return false; + + // Given constants C0 and C1 such that C0 + C1 is bit-width: + // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1) + // TODO: Match constant splat. + int64_t CstShlAmt, CstLShrAmt; + if (mi_match(ShlAmt, MRI, m_ICst(CstShlAmt)) && + mi_match(LShrAmt, MRI, m_ICst(CstLShrAmt)) && + CstShlAmt + CstLShrAmt == BitWidth) { + FshOpc = TargetOpcode::G_FSHR; + Amt = LShrAmt; + + } else if (mi_match(LShrAmt, MRI, + m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && + ShlAmt == Amt) { + // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt) FshOpc = TargetOpcode::G_FSHL; - // Match (or (shl x, sub(bw, amt)), (lshr y, amt)). 
- } else if (mi_match(Dst, MRI, - m_GOr(m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt)), - m_GShl(m_Reg(ShlSrc), - m_GSub(m_SpecificICstOrSplat(BitWidth), - m_Reg(ShlAmt)))))) { + } else if (mi_match(ShlAmt, MRI, + m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) && + LShrAmt == Amt) { + // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt) FshOpc = TargetOpcode::G_FSHR; } else { return false; } - if (ShlAmt != LShrAmt) - return false; - - LLT AmtTy = MRI.getType(ShlAmt); + LLT AmtTy = MRI.getType(Amt); if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}})) return false; MatchInfo = [=](MachineIRBuilder &B) { - B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, ShlAmt}); + B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, Amt}); }; return true; } @@ -4127,8 +4153,9 @@ bool CombinerHelper::matchBitfieldExtractFromAnd( assert(MI.getOpcode() == TargetOpcode::G_AND); Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); - if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal( - TargetOpcode::G_UBFX, Ty, Ty)) + LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty); + if (!getTargetLowering().isConstantUnsignedBitfieldExtractLegal( + TargetOpcode::G_UBFX, Ty, ExtractTy)) return false; int64_t AndImm, LSBImm; @@ -4148,7 +4175,6 @@ bool CombinerHelper::matchBitfieldExtractFromAnd( if (static_cast<uint64_t>(LSBImm) >= Size) return false; - LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty); uint64_t Width = APInt(Size, AndImm).countTrailingOnes(); MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); @@ -4214,8 +4240,9 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd( const Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); - if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal( - TargetOpcode::G_UBFX, Ty, Ty)) + LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty); + if (!getTargetLowering().isConstantUnsignedBitfieldExtractLegal( + TargetOpcode::G_UBFX, Ty, ExtractTy)) return false; // Try to match shr (and x, c1), c2 @@ -4249,8 +4276,8 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd( return false; MatchInfo = [=](MachineIRBuilder &B) { - auto WidthCst = B.buildConstant(Ty, Width); - auto PosCst = B.buildConstant(Ty, Pos); + auto WidthCst = B.buildConstant(ExtractTy, Width); + auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst}); }; return true; @@ -4850,37 +4877,39 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) return false; - MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); - MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + Register Op1 = MI.getOperand(1).getReg(); + Register Op2 = MI.getOperand(2).getReg(); + DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1}; + DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2}; unsigned PreferredFusedOpcode = HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. 
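One refactor repeats through the FMA combines in the hunks below: LHS and RHS change from bare MachineInstr * to DefinitionAndSourceRegister, which keeps the defining instruction paired with the register it defines, so expressions like LHS->getOperand(0).getReg() collapse to LHS.Reg. For reference, the helper is a plain aggregate (paraphrased from llvm/include/llvm/CodeGen/GlobalISel/Utils.h; the comments are added here):

  struct DefinitionAndSourceRegister {
    MachineInstr *MI; // the defining instruction
    Register Reg;     // the register that instruction defines
  };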
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && - isContractableFMul(*RHS, AllowFusionGlobally)) { - if (hasMoreUses(*LHS, *RHS, MRI)) + if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) && + isContractableFMul(*RHS.MI, AllowFusionGlobally)) { + if (hasMoreUses(*LHS.MI, *RHS.MI, MRI)) std::swap(LHS, RHS); } // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (isContractableFMul(*LHS, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()))) { + if (isContractableFMul(*LHS.MI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, - {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), - RHS->getOperand(0).getReg()}); + {LHS.MI->getOperand(1).getReg(), + LHS.MI->getOperand(2).getReg(), RHS.Reg}); }; return true; } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) - if (isContractableFMul(*RHS, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()))) { + if (isContractableFMul(*RHS.MI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, - {RHS->getOperand(1).getReg(), RHS->getOperand(2).getReg(), - LHS->getOperand(0).getReg()}); + {RHS.MI->getOperand(1).getReg(), + RHS.MI->getOperand(2).getReg(), LHS.Reg}); }; return true; } @@ -4897,8 +4926,10 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( return false; const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); - MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); - MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + Register Op1 = MI.getOperand(1).getReg(); + Register Op2 = MI.getOperand(2).getReg(); + DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1}; + DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2}; LLT DstType = MRI.getType(MI.getOperand(0).getReg()); unsigned PreferredFusedOpcode = @@ -4906,42 +4937,38 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. 
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && - isContractableFMul(*RHS, AllowFusionGlobally)) { - if (hasMoreUses(*LHS, *RHS, MRI)) + if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) && + isContractableFMul(*RHS.MI, AllowFusionGlobally)) { + if (hasMoreUses(*LHS.MI, *RHS.MI, MRI)) std::swap(LHS, RHS); } // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) MachineInstr *FpExtSrc; - if (mi_match(LHS->getOperand(0).getReg(), MRI, - m_GFPExt(m_MInstr(FpExtSrc))) && + if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) && isContractableFMul(*FpExtSrc, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FpExtSrc->getOperand(1).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg()); auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg()); - B.buildInstr( - PreferredFusedOpcode, {MI.getOperand(0).getReg()}, - {FpExtX.getReg(0), FpExtY.getReg(0), RHS->getOperand(0).getReg()}); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX.getReg(0), FpExtY.getReg(0), RHS.Reg}); }; return true; } // fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z) // Note: Commutes FADD operands. - if (mi_match(RHS->getOperand(0).getReg(), MRI, - m_GFPExt(m_MInstr(FpExtSrc))) && + if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) && isContractableFMul(*FpExtSrc, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FpExtSrc->getOperand(1).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg()); auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg()); - B.buildInstr( - PreferredFusedOpcode, {MI.getOperand(0).getReg()}, - {FpExtX.getReg(0), FpExtY.getReg(0), LHS->getOperand(0).getReg()}); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX.getReg(0), FpExtY.getReg(0), LHS.Reg}); }; return true; } @@ -4957,8 +4984,10 @@ bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA( if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true)) return false; - MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); - MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + Register Op1 = MI.getOperand(1).getReg(); + Register Op2 = MI.getOperand(2).getReg(); + DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1}; + DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2}; LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); unsigned PreferredFusedOpcode = @@ -4966,31 +4995,31 @@ bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA( // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. 
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && - isContractableFMul(*RHS, AllowFusionGlobally)) { - if (hasMoreUses(*LHS, *RHS, MRI)) + if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) && + isContractableFMul(*RHS.MI, AllowFusionGlobally)) { + if (hasMoreUses(*LHS.MI, *RHS.MI, MRI)) std::swap(LHS, RHS); } MachineInstr *FMA = nullptr; Register Z; // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z)) - if (LHS->getOpcode() == PreferredFusedOpcode && - (MRI.getVRegDef(LHS->getOperand(3).getReg())->getOpcode() == + if (LHS.MI->getOpcode() == PreferredFusedOpcode && + (MRI.getVRegDef(LHS.MI->getOperand(3).getReg())->getOpcode() == TargetOpcode::G_FMUL) && - MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()) && - MRI.hasOneNonDBGUse(LHS->getOperand(3).getReg())) { - FMA = LHS; - Z = RHS->getOperand(0).getReg(); + MRI.hasOneNonDBGUse(LHS.MI->getOperand(0).getReg()) && + MRI.hasOneNonDBGUse(LHS.MI->getOperand(3).getReg())) { + FMA = LHS.MI; + Z = RHS.Reg; } // fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z)) - else if (RHS->getOpcode() == PreferredFusedOpcode && - (MRI.getVRegDef(RHS->getOperand(3).getReg())->getOpcode() == + else if (RHS.MI->getOpcode() == PreferredFusedOpcode && + (MRI.getVRegDef(RHS.MI->getOperand(3).getReg())->getOpcode() == TargetOpcode::G_FMUL) && - MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()) && - MRI.hasOneNonDBGUse(RHS->getOperand(3).getReg())) { - Z = LHS->getOperand(0).getReg(); - FMA = RHS; + MRI.hasOneNonDBGUse(RHS.MI->getOperand(0).getReg()) && + MRI.hasOneNonDBGUse(RHS.MI->getOperand(3).getReg())) { + Z = LHS.Reg; + FMA = RHS.MI; } if (FMA) { @@ -5025,17 +5054,19 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); LLT DstType = MRI.getType(MI.getOperand(0).getReg()); - MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); - MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + Register Op1 = MI.getOperand(1).getReg(); + Register Op2 = MI.getOperand(2).getReg(); + DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1}; + DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2}; unsigned PreferredFusedOpcode = HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. 
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && - isContractableFMul(*RHS, AllowFusionGlobally)) { - if (hasMoreUses(*LHS, *RHS, MRI)) + if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) && + isContractableFMul(*RHS.MI, AllowFusionGlobally)) { + if (hasMoreUses(*LHS.MI, *RHS.MI, MRI)) std::swap(LHS, RHS); } @@ -5054,16 +5085,17 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( MachineInstr *FMulMI, *FMAMI; // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) - if (LHS->getOpcode() == PreferredFusedOpcode && - mi_match(LHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && + if (LHS.MI->getOpcode() == PreferredFusedOpcode && + mi_match(LHS.MI->getOperand(3).getReg(), MRI, + m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=](MachineIRBuilder &B) { buildMatchInfo(FMulMI->getOperand(1).getReg(), - FMulMI->getOperand(2).getReg(), - RHS->getOperand(0).getReg(), LHS->getOperand(1).getReg(), - LHS->getOperand(2).getReg(), B); + FMulMI->getOperand(2).getReg(), RHS.Reg, + LHS.MI->getOperand(1).getReg(), + LHS.MI->getOperand(2).getReg(), B); }; return true; } @@ -5073,7 +5105,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. - if (mi_match(LHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) && + if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) && FMAMI->getOpcode() == PreferredFusedOpcode) { MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); if (isContractableFMul(*FMulMI, AllowFusionGlobally) && @@ -5085,8 +5117,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( X = B.buildFPExt(DstType, X).getReg(0); Y = B.buildFPExt(DstType, Y).getReg(0); buildMatchInfo(FMulMI->getOperand(1).getReg(), - FMulMI->getOperand(2).getReg(), - RHS->getOperand(0).getReg(), X, Y, B); + FMulMI->getOperand(2).getReg(), RHS.Reg, X, Y, B); }; return true; @@ -5095,16 +5126,17 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( // fold (fadd z, (fma x, y, (fpext (fmul u, v))) // -> (fma x, y, (fma (fpext u), (fpext v), z)) - if (RHS->getOpcode() == PreferredFusedOpcode && - mi_match(RHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && + if (RHS.MI->getOpcode() == PreferredFusedOpcode && + mi_match(RHS.MI->getOperand(3).getReg(), MRI, + m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=](MachineIRBuilder &B) { buildMatchInfo(FMulMI->getOperand(1).getReg(), - FMulMI->getOperand(2).getReg(), - LHS->getOperand(0).getReg(), RHS->getOperand(1).getReg(), - RHS->getOperand(2).getReg(), B); + FMulMI->getOperand(2).getReg(), LHS.Reg, + RHS.MI->getOperand(1).getReg(), + RHS.MI->getOperand(2).getReg(), B); }; return true; } @@ -5114,7 +5146,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. 
- if (mi_match(RHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) && + if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) && FMAMI->getOpcode() == PreferredFusedOpcode) { MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); if (isContractableFMul(*FMulMI, AllowFusionGlobally) && @@ -5126,8 +5158,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( X = B.buildFPExt(DstType, X).getReg(0); Y = B.buildFPExt(DstType, Y).getReg(0); buildMatchInfo(FMulMI->getOperand(1).getReg(), - FMulMI->getOperand(2).getReg(), - LHS->getOperand(0).getReg(), X, Y, B); + FMulMI->getOperand(2).getReg(), LHS.Reg, X, Y, B); }; return true; } @@ -5144,16 +5175,18 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) return false; - MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); - MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + Register Op1 = MI.getOperand(1).getReg(); + Register Op2 = MI.getOperand(2).getReg(); + DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1}; + DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2}; LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. int FirstMulHasFewerUses = true; - if (isContractableFMul(*LHS, AllowFusionGlobally) && - isContractableFMul(*RHS, AllowFusionGlobally) && - hasMoreUses(*LHS, *RHS, MRI)) + if (isContractableFMul(*LHS.MI, AllowFusionGlobally) && + isContractableFMul(*RHS.MI, AllowFusionGlobally) && + hasMoreUses(*LHS.MI, *RHS.MI, MRI)) FirstMulHasFewerUses = false; unsigned PreferredFusedOpcode = @@ -5161,24 +5194,24 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( // fold (fsub (fmul x, y), z) -> (fma x, y, -z) if (FirstMulHasFewerUses && - (isContractableFMul(*LHS, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) { + (isContractableFMul(*LHS.MI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { - Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg()).getReg(0); - B.buildInstr( - PreferredFusedOpcode, {MI.getOperand(0).getReg()}, - {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), NegZ}); + Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {LHS.MI->getOperand(1).getReg(), + LHS.MI->getOperand(2).getReg(), NegZ}); }; return true; } // fold (fsub x, (fmul y, z)) -> (fma -y, z, x) - else if ((isContractableFMul(*RHS, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) { + else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { - Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg()).getReg(0); - B.buildInstr( - PreferredFusedOpcode, {MI.getOperand(0).getReg()}, - {NegY, RHS->getOperand(2).getReg(), LHS->getOperand(0).getReg()}); + Register NegY = + B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegY, RHS.MI->getOperand(2).getReg(), LHS.Reg}); }; return true; } diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index 306af808659a..64c2f0d5f8e4 100644 --- 
a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -37,6 +37,11 @@ Align GISelKnownBits::computeKnownAlignment(Register R, unsigned Depth) { switch (MI->getOpcode()) { case TargetOpcode::COPY: return computeKnownAlignment(MI->getOperand(1).getReg(), Depth); + case TargetOpcode::G_ASSERT_ALIGN: { + // TODO: Min with source + int64_t LogAlign = MI->getOperand(2).getImm(); + return Align(1ull << LogAlign); + } case TargetOpcode::G_FRAME_INDEX: { int FrameIdx = MI->getOperand(1).getIndex(); return MF.getFrameInfo().getObjectAlign(FrameIdx); @@ -466,6 +471,18 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, Known.Zero.setBitsFrom(SrcBitWidth); break; } + case TargetOpcode::G_ASSERT_ALIGN: { + int64_t LogOfAlign = MI.getOperand(2).getImm(); + if (LogOfAlign == 0) + break; + + // TODO: Should use maximum with source + // If a node is guaranteed to be aligned, set low zero bits accordingly as + // well as clearing one bits. + Known.Zero.setLowBits(LogOfAlign); + Known.One.clearLowBits(LogOfAlign); + break; + } case TargetOpcode::G_MERGE_VALUES: { unsigned NumOps = MI.getNumOperands(); unsigned OpSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp index 4ae427484945..e5f95ca5aa73 100644 --- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp @@ -297,10 +297,8 @@ bool InlineAsmLowering::lowerInlineAsm( GISelAsmOperandInfo &OpInfo = ConstraintOperands.back(); // Compute the value type for each operand. - if (OpInfo.Type == InlineAsm::isInput || - (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) { - - OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo++)); + if (OpInfo.hasArg()) { + OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo)); if (isa<BasicBlock>(OpInfo.CallOperandVal)) { LLVM_DEBUG(dbgs() << "Basic block input operands not supported yet\n"); @@ -312,10 +310,8 @@ bool InlineAsmLowering::lowerInlineAsm( // If this is an indirect operand, the operand is a pointer to the // accessed type. 
if (OpInfo.isIndirect) { - PointerType *PtrTy = dyn_cast<PointerType>(OpTy); - if (!PtrTy) - report_fatal_error("Indirect operand for inline asm not a pointer!"); - OpTy = PtrTy->getElementType(); + OpTy = Call.getAttributes().getParamElementType(ArgNo); + assert(OpTy && "Indirect operand must have elementtype attribute"); } // FIXME: Support aggregate input operands @@ -327,7 +323,7 @@ bool InlineAsmLowering::lowerInlineAsm( OpInfo.ConstraintVT = TLI->getAsmOperandValueType(DL, OpTy, true).getSimpleVT(); - + ++ArgNo; } else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) { assert(!Call.getType()->isVoidTy() && "Bad inline asm!"); if (StructType *STy = dyn_cast<StructType>(Call.getType())) { @@ -627,7 +623,8 @@ bool InlineAsmLowering::lowerInlineAsm( Register SrcReg = OpInfo.Regs[0]; unsigned SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI); - if (MRI->getType(ResRegs[i]).getSizeInBits() < SrcSize) { + LLT ResTy = MRI->getType(ResRegs[i]); + if (ResTy.isScalar() && ResTy.getSizeInBits() < SrcSize) { // First copy the non-typed virtual register into a generic virtual // register Register Tmp1Reg = @@ -635,9 +632,14 @@ bool InlineAsmLowering::lowerInlineAsm( MIRBuilder.buildCopy(Tmp1Reg, SrcReg); // Need to truncate the result of the register MIRBuilder.buildTrunc(ResRegs[i], Tmp1Reg); - } else { + } else if (ResTy.getSizeInBits() == SrcSize) { MIRBuilder.buildCopy(ResRegs[i], SrcReg); + } else { + LLVM_DEBUG(dbgs() << "Unhandled output operand with " + "mismatched register size\n"); + return false; } + break; } case TargetLowering::C_Immediate: diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index b10c9272a508..2bb5addefe48 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -71,9 +71,10 @@ InstructionSelect::InstructionSelect() void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetPassConfig>(); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (OptLevel != CodeGenOpt::None) { - AU.addRequired<GISelKnownBitsAnalysis>(); - AU.addPreserved<GISelKnownBitsAnalysis>(); AU.addRequired<ProfileSummaryInfoWrapperPass>(); LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); } @@ -97,9 +98,8 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { OptLevel = MF.getFunction().hasOptNone() ? 
CodeGenOpt::None : MF.getTarget().getOptLevel(); - GISelKnownBits *KB = nullptr; + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); if (OptLevel != CodeGenOpt::None) { - KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); if (PSI && PSI->hasProfileSummary()) BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI(); diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp index dc5a4d8f85aa..1d0c106fd5db 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -29,7 +29,7 @@ using namespace llvm; InstructionSelector::MatcherState::MatcherState(unsigned MaxRenderers) - : Renderers(MaxRenderers), MIs() {} + : Renderers(MaxRenderers) {} InstructionSelector::InstructionSelector() = default; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index e8a8efd5dad4..37bc8a65dc7c 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -564,7 +564,7 @@ static bool isLibCallInTailPosition(MachineInstr &MI, // the return. Ignore NoAlias and NonNull because they don't affect the // call sequence. AttributeList CallerAttrs = F.getAttributes(); - if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex) + if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs()) .removeAttribute(Attribute::NoAlias) .removeAttribute(Attribute::NonNull) .hasAttributes()) @@ -1677,7 +1677,7 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, // Widen SrcTy to WideTy. This does not affect the result, but since the // user requested this size, it is probably better handled than SrcTy and - // should reduce the total number of legalization artifacts + // should reduce the total number of legalization artifacts. if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { SrcTy = WideTy; SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0); @@ -3655,7 +3655,6 @@ static bool hasSameNumEltsOnAllVectorOperands( if (!Ty.isVector()) { if (!is_contained(NonVecOpIndices, OpIdx)) return false; - is_contained(NonVecOpIndices, OpIdx); continue; } diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp index a1acc4195840..328a278f3d68 100644 --- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -124,14 +124,13 @@ bool Localizer::localizeInterBlock(MachineFunction &MF, LocalizedInstrs.insert(LocalizedMI); MachineInstr &UseMI = *MOUse.getParent(); if (MRI->hasOneUse(Reg) && !UseMI.isPHI()) - InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI); + InsertMBB->insert(UseMI, LocalizedMI); else InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()), LocalizedMI); // Set a new register for the definition. 
- Register NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); - MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); + Register NewReg = MRI->cloneVirtualRegister(Reg); LocalizedMI->getOperand(0).setReg(NewReg); NewVRegIt = MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; @@ -174,9 +173,10 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) { while (II != MBB.end() && !Users.count(&*II)) ++II; - LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *&*II - << "\n"); assert(II != MBB.end() && "Didn't find the user in the MBB"); + LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II + << '\n'); + MI->removeFromParent(); MBB.insert(II, MI); Changed = true; diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 391251886fbb..c6720568b362 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -282,18 +282,6 @@ MachineInstrBuilder MachineIRBuilder::buildCopy(const DstOp &Res, return buildInstr(TargetOpcode::COPY, Res, Op); } -MachineInstrBuilder MachineIRBuilder::buildAssertSExt(const DstOp &Res, - const SrcOp &Op, - unsigned Size) { - return buildInstr(TargetOpcode::G_ASSERT_SEXT, Res, Op).addImm(Size); -} - -MachineInstrBuilder MachineIRBuilder::buildAssertZExt(const DstOp &Res, - const SrcOp &Op, - unsigned Size) { - return buildInstr(TargetOpcode::G_ASSERT_ZEXT, Res, Op).addImm(Size); -} - MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res, const ConstantInt &Val) { LLT Ty = Res.getLLTTy(*getMRI()); diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 937d94764be1..01af6bb51bb7 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -626,7 +626,8 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); if (isPreISelGenericOptimizationHint(Opc)) { assert((Opc == TargetOpcode::G_ASSERT_ZEXT || - Opc == TargetOpcode::G_ASSERT_SEXT) && + Opc == TargetOpcode::G_ASSERT_SEXT || + Opc == TargetOpcode::G_ASSERT_ALIGN) && "Unexpected hint opcode!"); // The only correct mapping for these is to always use the source register // bank. @@ -856,7 +857,7 @@ void RegBankSelect::RepairingPlacement::addInsertPoint( RegBankSelect::InstrInsertPoint::InstrInsertPoint(MachineInstr &Instr, bool Before) - : InsertPoint(), Instr(Instr), Before(Before) { + : Instr(Instr), Before(Before) { // Since we do not support splitting, we do not need to update // liveness and such, so do not do anything with P. 
assert((!Before || !Instr.isPHI()) && diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 4981a537dc7c..544af9a2954f 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -592,17 +592,17 @@ Optional<APFloat> llvm::ConstantFoldFPBinOp(unsigned Opcode, const Register Op1, return None; } -Optional<MachineInstr *> -llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, - const Register Op2, - const MachineRegisterInfo &MRI, - MachineIRBuilder &MIB) { - auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI); - if (!SrcVec1) - return None; +Register llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, + const Register Op2, + const MachineRegisterInfo &MRI, + MachineIRBuilder &MIB) { auto *SrcVec2 = getOpcodeDef<GBuildVector>(Op2, MRI); if (!SrcVec2) - return None; + return Register(); + + auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI); + if (!SrcVec1) + return Register(); const LLT EltTy = MRI.getType(SrcVec1->getSourceReg(0)); @@ -611,14 +611,14 @@ llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx), SrcVec2->getSourceReg(Idx), MRI); if (!MaybeCst) - return None; + return Register(); auto FoldedCstReg = MIB.buildConstant(EltTy, *MaybeCst).getReg(0); FoldedElements.emplace_back(FoldedCstReg); } // Create the new vector constant. auto CstVec = MIB.buildBuildVector(MRI.getType(SrcVec1->getReg(0)), FoldedElements); - return &*CstVec; + return CstVec.getReg(0); } bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, @@ -704,8 +704,7 @@ Register llvm::getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, - LLT RegTy) { - DebugLoc DL; // FIXME: Is no location the right choice? + const DebugLoc &DL, LLT RegTy) { MachineBasicBlock &EntryMBB = MF.front(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LiveIn = MRI.getLiveInVirtReg(PhysReg); diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp index 9fabcfb1f326..2ee9379cb286 100644 --- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -185,7 +185,7 @@ class Polynomial { APInt A; public: - Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() { + Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V) { IntegerType *Ty = dyn_cast<IntegerType>(V->getType()); if (Ty) { ErrorMSBs = 0; @@ -195,12 +195,12 @@ public: } Polynomial(const APInt &A, unsigned ErrorMSBs = 0) - : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {} + : ErrorMSBs(ErrorMSBs), V(nullptr), A(A) {} Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0) - : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {} + : ErrorMSBs(ErrorMSBs), V(nullptr), A(BitWidth, A) {} - Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {} + Polynomial() : ErrorMSBs((unsigned)-1), V(nullptr) {} /// Increment and clamp the number of undefined bits. 
void incErrorMSBs(unsigned amt) { @@ -677,7 +677,7 @@ public: FixedVectorType *const VTy; VectorInfo(FixedVectorType *VTy) - : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) { + : BB(nullptr), PV(nullptr), SVI(nullptr), VTy(VTy) { EI = new ElementInfo[VTy->getNumElements()]; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index e97dcca201e8..8a190e769941 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -251,9 +251,10 @@ public: /// creates DBG_VALUEs and puts them in #Transfers, then prepares the other /// object fields to track variable locations as we step through the block. /// FIXME: could just examine mloctracker instead of passing in \p mlocs? - void loadInlocs(MachineBasicBlock &MBB, ValueIDNum *MLocs, - SmallVectorImpl<std::pair<DebugVariable, DbgValue>> &VLocs, - unsigned NumLocs) { + void + loadInlocs(MachineBasicBlock &MBB, ValueIDNum *MLocs, + const SmallVectorImpl<std::pair<DebugVariable, DbgValue>> &VLocs, + unsigned NumLocs) { ActiveMLocs.clear(); ActiveVLocs.clear(); VarLocs.clear(); @@ -272,7 +273,7 @@ public: }; // Map of the preferred location for each value. - std::map<ValueIDNum, LocIdx> ValueToLoc; + DenseMap<ValueIDNum, LocIdx> ValueToLoc; ActiveMLocs.reserve(VLocs.size()); ActiveVLocs.reserve(VLocs.size()); @@ -283,6 +284,11 @@ public: LocIdx Idx = Location.Idx; ValueIDNum &VNum = MLocs[Idx.asU64()]; VarLocs.push_back(VNum); + + // Short-circuit unnecessary preferred location update. + if (VLocs.empty()) + continue; + auto it = ValueToLoc.find(VNum); // In order of preference, pick: // * Callee saved registers, @@ -298,7 +304,7 @@ public: } // Now map variables to their picked LocIdxes. - for (auto Var : VLocs) { + for (const auto &Var : VLocs) { if (Var.second.Kind == DbgValue::Const) { PendingDbgValues.push_back( emitMOLoc(*Var.second.MO, Var.first, Var.second.Properties)); @@ -413,7 +419,8 @@ public: return Reg != SP && Reg != FP; } - bool recoverAsEntryValue(const DebugVariable &Var, DbgValueProperties &Prop, + bool recoverAsEntryValue(const DebugVariable &Var, + const DbgValueProperties &Prop, const ValueIDNum &Num) { // Is this variable location a candidate to be an entry value. First, // should we be trying this at all? @@ -2799,31 +2806,28 @@ void InstrRefBasedLDV::emitLocations( } } - // We have to insert DBG_VALUEs in a consistent order, otherwise they appeaer - // in DWARF in different orders. Use the order that they appear when walking - // through each block / each instruction, stored in AllVarsNumbering. - auto OrderDbgValues = [&](const MachineInstr *A, - const MachineInstr *B) -> bool { - DebugVariable VarA(A->getDebugVariable(), A->getDebugExpression(), - A->getDebugLoc()->getInlinedAt()); - DebugVariable VarB(B->getDebugVariable(), B->getDebugExpression(), - B->getDebugLoc()->getInlinedAt()); - return AllVarsNumbering.find(VarA)->second < - AllVarsNumbering.find(VarB)->second; - }; - // Go through all the transfers recorded in the TransferTracker -- this is // both the live-ins to a block, and any movements of values that happen // in the middle. - for (auto &P : TTracker->Transfers) { - // Sort them according to appearance order. - llvm::sort(P.Insts, OrderDbgValues); + for (const auto &P : TTracker->Transfers) { + // We have to insert DBG_VALUEs in a consistent order, otherwise they + // appear in DWARF in different orders. 
Use the order that they appear + // when walking through each block / each instruction, stored in + // AllVarsNumbering. + SmallVector<std::pair<unsigned, MachineInstr *>> Insts; + for (MachineInstr *MI : P.Insts) { + DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(), + MI->getDebugLoc()->getInlinedAt()); + Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI); + } + llvm::sort(Insts, + [](const auto &A, const auto &B) { return A.first < B.first; }); + // Insert either before or after the designated point... if (P.MBB) { MachineBasicBlock &MBB = *P.MBB; - for (auto *MI : P.Insts) { - MBB.insert(P.Pos, MI); - } + for (const auto &Pair : Insts) + MBB.insert(P.Pos, Pair.second); } else { // Terminators, like tail calls, can clobber things. Don't try and place // transfers after them. @@ -2831,9 +2835,8 @@ void InstrRefBasedLDV::emitLocations( continue; MachineBasicBlock &MBB = *P.Pos->getParent(); - for (auto *MI : P.Insts) { - MBB.insertAfterBundle(P.Pos, MI); - } + for (const auto &Pair : Insts) + MBB.insertAfterBundle(P.Pos, Pair.second); } } } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 789205e61cdb..9e9c0ce394fd 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -494,7 +494,7 @@ public: return StackIdxesToPos.find(Idx)->second; } - unsigned getNumLocs(void) const { return LocIdxToIDNum.size(); } + unsigned getNumLocs() const { return LocIdxToIDNum.size(); } /// Reset all locations to contain a PHI value at the designated block. Used /// sometimes for actual PHI values, othertimes to indicate the block entry @@ -516,7 +516,7 @@ public: } /// Wipe any un-necessary location records after traversing a block. - void reset(void) { + void reset() { // We could reset all the location values too; however either loadFromArray // or setMPhis should be called before this object is re-used. Just // clear Masks, they're definitely not needed. @@ -525,7 +525,7 @@ public: /// Clear all data. Destroys the LocID <=> LocIdx map, which makes most of /// the information in this pass uninterpretable. - void clear(void) { + void clear() { reset(); LocIDToLocIdx.clear(); LocIdxToLocID.clear(); @@ -1082,7 +1082,9 @@ template <> struct DenseMapInfo<ValueIDNum> { return ValueIDNum::TombstoneValue; } - static unsigned getHashValue(const ValueIDNum &Val) { return Val.asU64(); } + static unsigned getHashValue(const ValueIDNum &Val) { + return hash_value(Val.asU64()); + } static bool isEqual(const ValueIDNum &A, const ValueIDNum &B) { return A == B; diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index 691977dc34e6..8f697611a82c 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -40,6 +40,10 @@ static cl::opt<bool> "normal DBG_VALUE inputs"), cl::init(false)); +static cl::opt<cl::boolOrDefault> ValueTrackingVariableLocations( + "experimental-debug-variable-locations", + cl::desc("Use experimental new value-tracking variable locations")); + // Options to prevent pathological compile-time behavior. If InputBBLimit and // InputDbgValueLimit are both exceeded, range extension is disabled. 
static cl::opt<unsigned> InputBBLimit( @@ -117,3 +121,8 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { return TheImpl->ExtendRanges(MF, DomTree, TPC, InputBBLimit, InputDbgValueLimit); } + +bool llvm::debuginfoShouldUseDebugInstrRef(const Triple &T) { + // Enable if explicitly requested on command line. + return ValueTrackingVariableLocations == cl::boolOrDefault::BOU_TRUE; +} diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h index a5936c8a96f0..8f0b2ec3e1fc 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h @@ -12,6 +12,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/ADT/Triple.h" namespace llvm { @@ -35,6 +36,9 @@ public: // Factory functions for LiveDebugValues implementations. extern LDVImpl *makeVarLocBasedLiveDebugValues(); extern LDVImpl *makeInstrRefBasedLiveDebugValues(); + +extern bool debuginfoShouldUseDebugInstrRef(const Triple &T); + } // namespace llvm #endif // LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index e6661e5135c3..6d806135240e 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -152,7 +152,7 @@ public: } } - DbgVariableValue() : LocNoCount(0), WasIndirect(0), WasList(0) {} + DbgVariableValue() : LocNoCount(0), WasIndirect(false), WasList(false) {} DbgVariableValue(const DbgVariableValue &Other) : LocNoCount(Other.LocNoCount), WasIndirect(Other.getWasIndirect()), WasList(Other.getWasList()), Expression(Other.getExpression()) { diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 2f97386b6d18..9571afa434c1 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -827,6 +827,8 @@ CancelKill: MachineBasicBlock* LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { + assert(!LI.empty() && "LiveInterval is empty."); + // A local live range must be fully contained inside the block, meaning it is // defined and killed at instructions, not at block boundaries. It is not // live in or out of any block. diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 1a04e1ca56a9..6477965bdc21 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -875,11 +875,11 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB, // N.B: Multiple lists of successors and liveins are allowed and they're // merged into one. 
// Example: - // liveins: %edi - // liveins: %esi + // liveins: $edi + // liveins: $esi // // is equivalent to - // liveins: %edi, %esi + // liveins: $edi, $esi bool ExplicitSuccessors = false; while (true) { if (Token.is(MIToken::kw_successors)) { diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index d0323eaf3d78..f144639770bc 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -182,8 +182,7 @@ static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) { MIRParserImpl::MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename, LLVMContext &Context, std::function<void(Function &)> Callback) - : SM(), - Context(Context), + : Context(Context), In(SM.getMemoryBuffer(SM.AddNewSourceBuffer(std::move(Contents), SMLoc())) ->getBuffer(), nullptr, handleYAMLDiag, this), diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp new file mode 100644 index 000000000000..a74c57690640 --- /dev/null +++ b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp @@ -0,0 +1,862 @@ +//===- MLRegAllocEvictAdvisor.cpp - ML eviction advisor -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the ML eviction advisor and reward injection pass +// +//===----------------------------------------------------------------------===// + +#include "RegAllocEvictionAdvisor.h" +#include "RegAllocGreedy.h" +#include "RegAllocScore.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MLModelRunner.h" +#include "llvm/Analysis/ModelUnderTrainingRunner.h" +#include "llvm/Analysis/NoInferenceModelRunner.h" +#include "llvm/Analysis/ReleaseModeModelRunner.h" +#include "llvm/Analysis/Utils/TFUtils.h" +#include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Config/config.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" + +#include <array> +#include <memory> + +using namespace llvm; + +#define DEBUG_TYPE "ml-regalloc" + +// Generated header in release (AOT) mode +#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) +#include "RegallocEvictModel.h" +#endif + +// Options that only make sense in development mode +#ifdef LLVM_HAVE_TF_API +static cl::opt<std::string> TrainingLog( + "regalloc-training-log", cl::Hidden, + cl::desc("Training log for the register allocator eviction model")); + +static cl::opt<std::string> ModelUnderTraining( + "regalloc-model", cl::Hidden, + cl::desc("The model being trained for register allocation eviction")); + +#endif // #ifdef LLVM_HAVE_TF_API + +/// The score injection pass. +/// This pass calculates the score for a function and inserts it in the log, but +/// this happens only in development mode. It's a no-op otherwise. 
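Two build configurations thread through the rest of this new file: in release (AOT) mode the compiled model arrives via the generated RegallocEvictModel.h header included above, with no flags involved, while in development mode (builds with LLVM_HAVE_TF_API) the -regalloc-model and -regalloc-training-log options declared above name the model under training and the log it should append to. Since these are cl::opt options, any codegen-enabled tool accepts them; a hypothetical development-mode invocation would look like "llc -O2 input.ll -o input.o -regalloc-model=path/to/saved_model -regalloc-training-log=path/to/training.log", where both paths are placeholders rather than values from the diff. The scoring pass declared next computes the per-function reward that accompanies that log; as its comment notes, it is a no-op outside development mode.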
+namespace llvm { +class RegAllocScoring : public MachineFunctionPass { +public: + static char ID; + + RegAllocScoring() : MachineFunctionPass(ID) { + initializeRegAllocScoringPass(*PassRegistry::getPassRegistry()); + } + + ~RegAllocScoring() override = default; + + StringRef getPassName() const override { + return "Register Allocation Pass Scoring"; + } + + /// RegAllocReward analysis usage. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + AU.addRequired<RegAllocEvictionAdvisorAnalysis>(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addRequired<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + /// Performs this pass + bool runOnMachineFunction(MachineFunction &) override; +}; + +char RegAllocScoring::ID = 0; +FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); } + +} // namespace llvm + +INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass", + "Register Allocation Scoring Pass", false, false) + +// =================================== +// Common ML Advisor declarations +// =================================== +namespace { +// This is the maximum number of interfering ranges. That's the number of +// distinct AllocationOrder values, which comes from MCRegisterClass::RegsSize. +// For X86, that's 32. +// TODO: find a way to get this, statically, in a programmatic way. +static const int64_t MaxInterferences = 32; + +// Logically, we can think of the feature set given to the evaluator as a 2D +// matrix. The rows are the features (see next). The columns correspond to the +// interferences. We treat the candidate virt reg as an 'interference', too, as +// its feature set is the same as that of the interfering ranges. So we'll have +// MaxInterferences + 1 columns and by convention, we will use the last column +// for the virt reg seeking allocation. +static const int64_t CandidateVirtRegPos = MaxInterferences; +static const int64_t NumberOfInterferences = CandidateVirtRegPos + 1; + +// Most features are as described above, so we'll reuse this vector in defining +// them. +static const std::vector<int64_t> PerLiveRangeShape{1, NumberOfInterferences}; + +// -------------- +// Features table +// -------------- +// For each interfering live range (incl. the candidate) we collect a number of +// features. However, because the features are of different types (and because +// of ML best practices), we organize the tensors per feature, not per +// candidate. Each such tensor has a scalar value corresponding to the +// interfering live range at that position, in the order in AllocationOrder. +// The last position corresponds to the virt reg seeking allocation. +// Exception to all that is the progression feature, which is just a scalar (see +// its documentation for details). +// Note on naming: the "_by_max" are normalized using the largest value of that +// tensor, as observed in the current decision making stage (i.e. for the +// current call to the advisor's tryFindEvictionCandidate) +// +// The feature list format: type, name, shape, documentation. +// Note: we can really just use int64 and float, hence the modeling of some +// bools as int64 values. +#define RA_EVICT_FEATURES_LIST(M) \ + M(int64_t, mask, PerLiveRangeShape, \ + "boolean values, 0 for unavailable candidates (i.e.
+ "boolean values, 0 for unavailable candidates (i.e. if a position is 0, " \
+ "it " \
+ "can't be evicted)") \
+ M(int64_t, is_free, PerLiveRangeShape, \
+ "boolean values, 1 if this phys reg is actually free (no interferences)") \
+ M(float, nr_urgent, PerLiveRangeShape, \
+ "number of 'urgent' intervals, normalized. Urgent are those that are OK " \
+ "to break cascades") \
+ M(float, nr_broken_hints, PerLiveRangeShape, \
+ "if this position were evicted, how many broken hints would there be") \
+ M(int64_t, is_hint, PerLiveRangeShape, \
+ "is this a preferred phys reg for the candidate") \
+ M(int64_t, is_local, PerLiveRangeShape, \
+ "is this live range local to a basic block") \
+ M(float, nr_rematerializable, PerLiveRangeShape, \
+ "nr rematerializable ranges") \
+ M(float, nr_defs_and_uses, PerLiveRangeShape, \
+ "bb freq - weighed nr defs and uses") \
+ M(float, weighed_reads_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of reads, normalized") \
+ M(float, weighed_writes_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of writes, normalized") \
+ M(float, weighed_read_writes_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are both read and writes, normalized") \
+ M(float, weighed_indvars_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are indvars, normalized") \
+ M(float, hint_weights_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are hints, normalized") \
+ M(float, start_bb_freq_by_max, PerLiveRangeShape, \
+ "the freq in the start block, normalized") \
+ M(float, end_bb_freq_by_max, PerLiveRangeShape, \
+ "freq of end block, normalized") \
+ M(float, hottest_bb_freq_by_max, PerLiveRangeShape, \
+ "hottest BB freq, normalized") \
+ M(float, liverange_size, PerLiveRangeShape, \
+ "size (instr index diff) of the LR") \
+ M(float, use_def_density, PerLiveRangeShape, \
+ "the max weight, as computed by the manual heuristic") \
+ M(int64_t, max_stage, PerLiveRangeShape, \
+ "largest stage of an interval in this LR") \
+ M(int64_t, min_stage, PerLiveRangeShape, \
+ "lowest stage of an interval in this LR") \
+ M(float, progress, {1}, "ratio of current queue size to initial size")
+
+// The model learns to pick one of the mask == 1 interferences. This is the name
+// of the output tensor.
+// The contract with the model is that the output will be guaranteed to be a
+// mask == 1 position.
+// Using a macro here to avoid 'not used' warnings (and keep cond compilation to
+// a minimum)
+#define DecisionName "index_to_evict"
+
+// Named features index.
+enum FeatureIDs {
+#define _FEATURE_IDX(_, name, __, ___) name,
+ RA_EVICT_FEATURES_LIST(_FEATURE_IDX)
+#undef _FEATURE_IDX
+ FeatureCount
+};
+
+// The ML advisor will typically have a sparse input to the evaluator, because
+// various phys regs won't be available. It's easier (maintenance-wise) to
+// bulk-reset the state of the evaluator each time we are about to use it again.
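// Illustrative sketch (editorial, not part of the patch): RA_EVICT_FEATURES_LIST
// is an X-macro, so each M(TYPE, NAME, SHAPE, DOC) row expands once per use
// site. The enum above, for instance, effectively becomes
//
//   enum FeatureIDs { mask, is_free, nr_urgent, /* ... */ progress,
//                     FeatureCount };
//
// and the _RESET expansion below turns into one std::memset per feature
// tensor, which is what keeps the bulk-reset cheap to maintain as the feature
// list evolves.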
+template <typename T> size_t getTotalSize(const std::vector<int64_t> &Shape) {
+ size_t Ret = sizeof(T);
+ for (const auto V : Shape)
+ Ret *= V;
+ return Ret;
+}
+
+void resetInputs(MLModelRunner &Runner) {
+#define _RESET(TYPE, NAME, SHAPE, __) \
+ std::memset(Runner.getTensorUntyped(FeatureIDs::NAME), 0, \
+ getTotalSize<TYPE>(SHAPE));
+ RA_EVICT_FEATURES_LIST(_RESET)
+#undef _RESET
+}
+
+using CandidateRegList =
+ std::array<std::pair<MCRegister, bool>, NumberOfInterferences>;
+using FeaturesListNormalizer = std::array<float, FeatureIDs::FeatureCount>;
+
+/// The ML evictor (commonalities between release and development mode)
+class MLEvictAdvisor : public RegAllocEvictionAdvisor {
+public:
+ MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner, const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops);
+
+protected:
+ const RegAllocEvictionAdvisor &getDefaultAdvisor() const {
+ return static_cast<const RegAllocEvictionAdvisor &>(DefaultAdvisor);
+ }
+
+ // The assumption is that if the Runner could not be constructed, we emitted
+ // an error, and we shouldn't be asking for it here.
+ const MLModelRunner &getRunner() const { return *Runner; }
+
+ /// This just calls Evaluate on the Runner, but in the development mode case,
+ /// if we're just capturing the log of the default advisor, it needs to call
+ /// the latter instead, so we need to pass all the necessary parameters for
+ /// it. In the development case, it will also log.
+ virtual int64_t tryFindEvictionCandidatePosition(
+ LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const;
+
+ /// Load the features of the given VirtReg (allocated or not) at column Pos,
+ /// but if that can't be evicted, return false instead.
+ bool
+ loadInterferenceFeatures(LiveInterval &VirtReg, MCRegister PhysReg,
+ bool IsHint, const SmallVirtRegSet &FixedRegisters,
+ std::array<float, FeatureIDs::FeatureCount> &Largest,
+ size_t Pos) const;
+
+private:
+ static float getInitialQueueSize(const MachineFunction &MF);
+
+ MCRegister tryFindEvictionCandidate(
+ LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const override;
+
+ void extractFeatures(const SmallVectorImpl<LiveInterval *> &Intervals,
+ std::array<float, FeatureIDs::FeatureCount> &Largest,
+ size_t Pos, int64_t IsHint, int64_t LocalIntfsCount,
+ float NrUrgent) const;
+
+ // Point-in-time: we didn't learn this, so we always delegate to the default.
+ bool canEvictHintInterference(
+ LiveInterval &VirtReg, MCRegister PhysReg,
+ const SmallVirtRegSet &FixedRegisters) const override {
+ return getDefaultAdvisor().canEvictHintInterference(VirtReg, PhysReg,
+ FixedRegisters);
+ }
+
+ // Hold on to a default advisor for:
+ // 1) the implementation of canEvictHintInterference, because we didn't learn
+ // that nuance yet;
+ // 2) for bootstrapping (logging) in the development mode case.
+ const DefaultEvictionAdvisor DefaultAdvisor;
+ MLModelRunner *const Runner;
+ const MachineBlockFrequencyInfo &MBFI;
+ const MachineLoopInfo &Loops;
+
+ // Indices of those features we don't want to normalize.
+ // This could be static and shared, but its initialization is non-trivial.
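// Illustrative sketch (editorial, not part of the patch): the bitset declared
// next marks the categorical or already-bounded features (mask, is_free,
// is_hint, is_local, the stages, progress) so the normalization step skips
// them, roughly:
//
//   for (size_t F = 0; F != FeatureIDs::FeatureCount; ++F)
//     if (!DoNotNormalize.test(F))
//       for (size_t P = 0; P != NumberOfInterferences; ++P)
//         Runner->getTensor<float>(F)[P] /= Largest[F];
//
// This mirrors the loop in tryFindEvictionCandidate below.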
+ std::bitset<FeatureIDs::FeatureCount> DoNotNormalize; + const float InitialQSize; +}; + +// =================================== +// Release (AOT) - specifics +// =================================== +#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) +const std::array<std::string, FeatureIDs::FeatureCount> FeatureNames{ +#define _GETNAME(_, NAME, __, ___) #NAME, + RA_EVICT_FEATURES_LIST(_GETNAME) +#undef _GETNAME +}; +class ReleaseModeEvictionAdvisorAnalysis final + : public RegAllocEvictionAdvisorAnalysis { +public: + ReleaseModeEvictionAdvisorAnalysis() + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) {} + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + return R->getAdvisorMode() == AdvisorMode::Release; + } + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineLoopInfo>(); + RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU); + } + + std::unique_ptr<RegAllocEvictionAdvisor> + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + if (!Runner) + Runner = std::make_unique<ReleaseModeModelRunner<RegallocEvictModel>>( + MF.getFunction().getContext(), FeatureNames, DecisionName); + return std::make_unique<MLEvictAdvisor>( + MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(), + getAnalysis<MachineLoopInfo>()); + } + std::unique_ptr<ReleaseModeModelRunner<RegallocEvictModel>> Runner; +}; +#endif + +// =================================== +// Development mode-specifics +// =================================== +// +// Features we log +#ifdef LLVM_HAVE_TF_API +#define _DECL_FEATURES(type, name, shape, _) \ + TensorSpec::createSpec<type>(#name, shape), + +static const std::vector<TensorSpec> InputFeatures{ + {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}, +}; +#undef _DECL_FEATURES +static const TensorSpec Output = + TensorSpec::createSpec<int64_t>(DecisionName, {1}); +static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1}); + +// Features we bind on the model. The tensor names have a prefix, and we also +// need to include some tensors that are expected to be present by the training +// algo. +// TODO: can we just get rid of these? +#define _DECL_TRAIN_FEATURES(type, name, shape, _) \ + TensorSpec::createSpec<type>(std::string("action_") + #name, shape), + +static const std::vector<TensorSpec> TrainingInputFeatures{ + {RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) + TensorSpec::createSpec<float>("action_discount", {1}), + TensorSpec::createSpec<int32_t>("action_step_type", {1}), + TensorSpec::createSpec<float>("action_reward", {1})}}; +#undef _DECL_TRAIN_FEATURES + +class DevelopmentModeEvictAdvisor : public MLEvictAdvisor { +public: + DevelopmentModeEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MLModelRunner *Runner, + const MachineBlockFrequencyInfo &MBFI, + const MachineLoopInfo &Loops, Logger *Log) + : MLEvictAdvisor(MF, RA, Runner, MBFI, Loops), Log(Log) {} + +private: + int64_t tryFindEvictionCandidatePosition( + LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit, + uint8_t CostPerUseLimit, + const SmallVirtRegSet &FixedRegisters) const override; + + Logger *const Log; +}; + +class DevelopmentModeEvictionAdvisorAnalysis final + : public RegAllocEvictionAdvisorAnalysis { +public: + DevelopmentModeEvictionAdvisorAnalysis() + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) {} + // support for isa<> and dyn_cast. 
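// Illustrative note (editorial, not part of the patch): AdvisorMode doubles as
// the discriminator for LLVM-style RTTI, which is what later lets the scoring
// pass recover the development-mode analysis:
//
//   if (auto *DevModeAnalysis =
//           dyn_cast<DevelopmentModeEvictionAdvisorAnalysis>(
//               &getAnalysis<RegAllocEvictionAdvisorAnalysis>()))
//     if (auto *Log = DevModeAnalysis->getLogger(MF))
//       ...
//
// as done by RegAllocScoring::runOnMachineFunction at the end of this file.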
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + return R->getAdvisorMode() == AdvisorMode::Development; + } + + /// get the logger for the given function, or nullptr if we didn't collect + /// one. This is used to inject the score by the RegAllocScoring pass. + Logger *getLogger(const MachineFunction &MF) const { + auto I = LogMap.find(MF.getName()); + if (I == LogMap.end()) + return nullptr; + return I->second.get(); + } + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineLoopInfo>(); + RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU); + } + + // Save all the logs (when requested). + bool doFinalization(Module &M) override { + if (TrainingLog.empty()) + return false; + std::error_code EC; + auto OS = std::make_unique<raw_fd_ostream>(TrainingLog, EC); + if (EC) { + M.getContext().emitError(EC.message() + ":" + TrainingLog); + return false; + } + Logger::flushLogs(*OS, LogMap); + return false; + } + + std::unique_ptr<RegAllocEvictionAdvisor> + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + LLVMContext &Ctx = MF.getFunction().getContext(); + if (ModelUnderTraining.empty() && TrainingLog.empty()) { + Ctx.emitError("Regalloc development mode should be requested with at " + "least logging enabled and/or a training model"); + return nullptr; + } + if (!Runner) { + if (ModelUnderTraining.empty()) + Runner = std::make_unique<NoInferenceModelRunner>(Ctx, InputFeatures); + else + Runner = ModelUnderTrainingRunner::createAndEnsureValid( + Ctx, ModelUnderTraining, DecisionName, TrainingInputFeatures); + if (!Runner) { + Ctx.emitError("Regalloc: could not set up the model runner"); + return nullptr; + } + } + + Logger *Log = nullptr; + if (!TrainingLog.empty()) { + std::vector<LoggedFeatureSpec> LFS; + for (const auto &FS : InputFeatures) + LFS.push_back({FS, None}); + if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(Runner.get())) + if (MUTR->outputLoggedFeatureSpecs().size() > 1) + append_range(LFS, drop_begin(MUTR->outputLoggedFeatureSpecs())); + // We always log the output; in particular, if we're not evaluating, we + // don't have an output spec json file. That's why we handle the + // 'normal' output separately. 
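// Illustrative note (editorial, not part of the patch): with this setup, each
// function's training log contains, per eviction decision, roughly one row of
//
//   [ input feature tensors..., extra model outputs (when evaluating),
//     index_to_evict ]
//
// followed by a single final "reward" scalar injected by RegAllocScoring.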
+ LFS.push_back({Output, None}); + auto I = LogMap.insert(std::make_pair( + MF.getFunction().getName(), + std::make_unique<Logger>(LFS, Reward, /*IncludeReward*/ true))); + assert(I.second); + Log = I.first->second.get(); + } + return std::make_unique<DevelopmentModeEvictAdvisor>( + MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(), + getAnalysis<MachineLoopInfo>(), Log); + } + + std::unique_ptr<MLModelRunner> Runner; + StringMap<std::unique_ptr<Logger>> LogMap; +}; +#endif //#ifdef LLVM_HAVE_TF_API +} // namespace + +float MLEvictAdvisor::getInitialQueueSize(const MachineFunction &MF) { + auto &MRI = MF.getRegInfo(); + float Ret = 0.0; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) + continue; + ++Ret; + } + return Ret; +} + +MLEvictAdvisor::MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MLModelRunner *Runner, + const MachineBlockFrequencyInfo &MBFI, + const MachineLoopInfo &Loops) + : RegAllocEvictionAdvisor(MF, RA), DefaultAdvisor(MF, RA), + Runner(std::move(Runner)), MBFI(MBFI), Loops(Loops), + InitialQSize(MLEvictAdvisor::getInitialQueueSize(MF)) { + assert(this->Runner); + DoNotNormalize.set(FeatureIDs::mask); + DoNotNormalize.set(FeatureIDs::is_free); + DoNotNormalize.set(FeatureIDs::is_hint); + DoNotNormalize.set(FeatureIDs::is_local); + DoNotNormalize.set(FeatureIDs::min_stage); + DoNotNormalize.set(FeatureIDs::max_stage); + DoNotNormalize.set(FeatureIDs::progress); +} + +int64_t MLEvictAdvisor::tryFindEvictionCandidatePosition( + LiveInterval &, const AllocationOrder &, unsigned, uint8_t, + const SmallVirtRegSet &) const { + int64_t Ret = Runner->evaluate<int64_t>(); + assert(Ret >= 0); + assert(Ret <= CandidateVirtRegPos); + return Ret; +} + +bool MLEvictAdvisor::loadInterferenceFeatures( + LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, + const SmallVirtRegSet &FixedRegisters, FeaturesListNormalizer &Largest, + size_t Pos) const { + // It is only possible to evict virtual register interference. + if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) { + // leave unavailable + return false; + } + + const bool IsLocal = LIS->intervalIsInOneMBB(VirtReg); + int64_t LocalIntfs = 0; + float NrUrgent = 0.0f; + + // The cascade tracking is the same as in the default advisor + unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); + + SmallVector<LiveInterval *, MaxInterferences> InterferingIntervals; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + // Different from the default heuristic, we don't make any assumptions about + // what having more than 10 results in the query may mean. + const auto &IFIntervals = Q.interferingVRegs(); + if (IFIntervals.empty() && InterferingIntervals.empty()) + continue; + InterferingIntervals.append(IFIntervals.begin(), IFIntervals.end()); + for (LiveInterval *Intf : reverse(IFIntervals)) { + assert(Register::isVirtualRegister(Intf->reg()) && + "Only expecting virtual register interference from query"); + // This is the same set of legality checks as in the default case: don't + // try to evict fixed regs or 'done' ones. Also don't break cascades, + // except in the urgent case, with the same nuances used in the default + // heuristic. + // We could try sharing this between the advisors, but it may end up + // more complex than it is right now. 
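// Worked example (editorial, not part of the patch): with the checks below, if
// VirtReg carries cascade 3, interferences with cascade 0 (i.e. none), 1 or 2
// stay evictable, while an interference whose cascade is >= 3 makes the whole
// position unavailable unless the eviction is "urgent", in which case it is
// allowed and NrUrgent is bumped.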
+ if (FixedRegisters.count(Intf->reg()))
+ return false;
+ if (RA.getExtraInfo().getStage(*Intf) == RS_Done)
+ return false;
+ bool Urgent =
+ !VirtReg.isSpillable() &&
+ (Intf->isSpillable() ||
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg())) <
+ RegClassInfo.getNumAllocatableRegs(
+ MRI->getRegClass(Intf->reg())));
+ // Only evict older cascades or live ranges without a cascade.
+ unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg());
+ if (Cascade <= IntfCascade) {
+ if (!Urgent)
+ return false;
+ ++NrUrgent;
+ }
+
+ LocalIntfs += (IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
+ (!EnableLocalReassign || !canReassign(*Intf, PhysReg)));
+ }
+ }
+ // OK, so if we made it this far, this LR is an eviction candidate, load its
+ // features.
+ extractFeatures(InterferingIntervals, Largest, Pos, IsHint, LocalIntfs,
+ NrUrgent);
+ return true;
+}
+
+MCRegister MLEvictAdvisor::tryFindEvictionCandidate(
+ LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
+ auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit);
+ if (!MaybeOrderLimit)
+ return MCRegister::NoRegister;
+ unsigned OrderLimit = *MaybeOrderLimit;
+
+ // The heuristic sets initial costs such that, if CostPerUseLimit is
+ // max<uint8_t>, then any of the costs of the legally-evictable intervals
+ // would be lower. When that happens, one of those will be selected.
+ // Therefore, we allow the candidate to be selected, unless the candidate is
+ // unspillable, in which case it would be incorrect to not find a register for
+ // it.
+ const bool MustFindEviction =
+ (!VirtReg.isSpillable() && CostPerUseLimit == static_cast<uint8_t>(~0u));
+ // Number of available candidates - if 0, no need to continue.
+ size_t Available = 0;
+ // Make sure we don't have leftover partial state from an attempt where we had
+ // no available candidates and bailed out early.
+ resetInputs(*Runner);
+
+ // Track the index->register mapping because AllocationOrder doesn't do that
+ // and we'd have to scan it.
+ // Also track their mask, to write asserts/debug.
+ CandidateRegList Regs;
+ Regs.fill({0, false});
+
+ // Track the largest value of features seen during this eviction session. We
+ // only normalize (some of) the float features, but it's just simpler to
+ // dimension 'Largest' to all the features, especially since we have the
+ // 'DoNotNormalize' list.
+ FeaturesListNormalizer Largest;
+ Largest.fill(0.0);
+
+ // Same overall idea as in the default eviction policy - we visit the values of
+ // AllocationOrder one at a time. If it's not legally available, we mask off
+ // the corresponding feature column (==do nothing because we already reset all
+ // the features to 0).
+ // Use Pos to capture the column we load features at - in AllocationOrder
+ // order.
+ size_t Pos = 0;
+ for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E;
+ ++I, ++Pos) {
+ MCRegister PhysReg = *I;
+ Regs[Pos] = std::make_pair(PhysReg, true);
+ assert(PhysReg);
+ if (!canAllocatePhysReg(CostPerUseLimit, PhysReg)) {
+ Regs[Pos].second = false;
+ continue;
+ }
+ if (loadInterferenceFeatures(VirtReg, PhysReg, I.isHint(), FixedRegisters,
+ Largest, Pos)) {
+ ++Available;
+ Regs[Pos].second = true;
+ }
+ }
+ if (Available == 0) {
+ // Nothing to decide, nothing to learn.
+ assert(!MustFindEviction);
+ return MCRegister::NoRegister;
+ }
+ // If we must find eviction, the candidate should be masked out of the
+ // decision making process.
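// Illustrative sketch (editorial, not part of the patch): at this point the
// evaluator's input can be pictured as a FeatureCount x NumberOfInterferences
// matrix. For any float feature F:
//
//   float *Row = Runner->getTensor<float>(F);
//   Row[Pos];                  // the Pos-th register in AllocationOrder
//   Row[CandidateVirtRegPos];  // the virt reg seeking allocation
//
// while Regs[] remembers which column corresponds to which MCRegister.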
+ Regs[CandidateVirtRegPos].second = !MustFindEviction;
+ if (!MustFindEviction)
+ extractFeatures(SmallVector<LiveInterval *, 1>(1, &VirtReg), Largest,
+ CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0,
+ /*NrUrgent*/ 0.0);
+ assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had "
+ "nothing to allocate initially.");
+ // Normalize the features.
+ for (auto &V : Largest)
+ V = V ? V : 1.0;
+ for (size_t FeatureIndex = 0; FeatureIndex < FeatureIDs::FeatureCount;
+ ++FeatureIndex) {
+ if (DoNotNormalize.test(FeatureIndex))
+ continue;
+ for (size_t Pos = 0; Pos < NumberOfInterferences; ++Pos) {
+ Runner->getTensor<float>(FeatureIndex)[Pos] /= Largest[FeatureIndex];
+ }
+ }
+ *Runner->getTensor<float>(FeatureIDs::progress) =
+ static_cast<float>(RA.getQueueSize()) / InitialQSize;
+
+ // Get a decision.
+ size_t CandidatePos = tryFindEvictionCandidatePosition(
+ VirtReg, Order, OrderLimit, CostPerUseLimit, FixedRegisters);
+ // The contract with the ML side is that CandidatePos is mask == 1 (i.e.
+ // Regs[CandidatePos].second)
+ assert(Regs[CandidatePos].second);
+ if (CandidatePos == CandidateVirtRegPos) {
+ assert(!MustFindEviction);
+ return MCRegister::NoRegister;
+ }
+ return Regs[CandidatePos].first;
+}
+
+// Overall, this currently mimics what we do for weight calculation, but instead
+// of accumulating the various features, we keep them separate.
+void MLEvictAdvisor::extractFeatures(
+ const SmallVectorImpl<LiveInterval *> &Intervals,
+ std::array<float, FeatureIDs::FeatureCount> &Largest, size_t Pos,
+ int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const {
+ int64_t NrDefsAndUses = 0;
+ int64_t NrBrokenHints = 0;
+ float R = 0;
+ float W = 0;
+ float RW = 0;
+ float IndVarUpdates = 0;
+ float HintWeights = 0.0;
+ float StartBBFreq = 0.0;
+ float EndBBFreq = 0.0;
+ float HottestBlockFreq = 0.0;
+ int32_t NrRematerializable = 0;
+ float TotalWeight = 0.0;
+
+ SlotIndex EndSI = LIS->getSlotIndexes()->getZeroIndex();
+ SlotIndex StartSI = LIS->getSlotIndexes()->getLastIndex();
+ int64_t MaxStage = 0;
+ int64_t MinStage =
+ Intervals.empty() ? 0 : std::numeric_limits<int64_t>::max();
+
+ for (const auto *L : Intervals) {
+ const LiveInterval &LI = *L;
+ MaxStage = std::max<int64_t>(
+ MaxStage, static_cast<int64_t>(RA.getExtraInfo().getStage(LI)));
+ MinStage = std::min<int64_t>(
+ MinStage, static_cast<int64_t>(RA.getExtraInfo().getStage(LI)));
+
+ TotalWeight = std::max(TotalWeight, LI.weight());
+
+ if (LI.beginIndex() < StartSI)
+ StartSI = LI.beginIndex();
+
+ if (LI.endIndex() > EndSI)
+ EndSI = LI.endIndex();
+
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ NrBrokenHints += VRM->hasPreferredPhys(LI.reg());
+
+ for (MachineRegisterInfo::reg_instr_nodbg_iterator
+ I = MRI->reg_instr_nodbg_begin(LI.reg()),
+ E = MRI->reg_instr_nodbg_end();
+ I != E;) {
+ MachineInstr *MI = &*(I++);
+
+ ++NrDefsAndUses;
+ if (!Visited.insert(MI).second)
+ continue;
+
+ if (MI->isIdentityCopy() || MI->isImplicitDef())
+ continue;
+
+ bool Reads, Writes;
+ std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
+
+ float Freq = MBFI.getBlockFreqRelativeToEntryBlock(MI->getParent());
+ if (Freq > HottestBlockFreq)
+ HottestBlockFreq = Freq;
+ R += (Reads && !Writes) * Freq;
+ W += (!Reads && Writes) * Freq;
+ RW += (Reads && Writes) * Freq;
+
+ auto *MBB = MI->getParent();
+ auto *Loop = Loops.getLoopFor(MBB);
+ bool IsExiting = Loop ?
Loop->isLoopExiting(MBB) : false; + + if (Writes && IsExiting && LIS->isLiveOutOfMBB(LI, MBB)) + IndVarUpdates += Freq; + + if (MI->isCopy() && VirtRegAuxInfo::copyHint(MI, LI.reg(), TRI, *MRI)) + HintWeights += Freq; + } + NrRematerializable += VirtRegAuxInfo::isRematerializable( + LI, *LIS, *VRM, *MF.getSubtarget().getInstrInfo()); + } + size_t Size = 0; + if (!Intervals.empty()) { + StartBBFreq = + MBFI.getBlockFreqRelativeToEntryBlock(LIS->getMBBFromIndex(StartSI)); + if (EndSI >= LIS->getSlotIndexes()->getLastIndex()) + EndSI = LIS->getSlotIndexes()->getLastIndex().getPrevIndex(); + EndBBFreq = + MBFI.getBlockFreqRelativeToEntryBlock(LIS->getMBBFromIndex(EndSI)); + Size = StartSI.distance(EndSI); + } + // Set the features at the column 'Pos'. +#define SET(ID, TYPE, VAL) \ + do { \ + Runner->getTensor<TYPE>(FeatureIDs::ID)[Pos] = static_cast<TYPE>(VAL); \ + if (!DoNotNormalize.test(FeatureIDs::ID)) \ + Largest[FeatureIDs::ID] = \ + std::max(Largest[FeatureIDs::ID], static_cast<float>(VAL)); \ + } while (false) + SET(mask, int64_t, 1); + SET(is_free, int64_t, Intervals.empty()); + SET(nr_urgent, float, NrUrgent); + SET(nr_broken_hints, float, NrBrokenHints); + SET(is_hint, int64_t, IsHint); + SET(is_local, int64_t, LocalIntfsCount); + SET(nr_rematerializable, float, NrRematerializable); + SET(nr_defs_and_uses, float, NrDefsAndUses); + SET(weighed_reads_by_max, float, R); + SET(weighed_writes_by_max, float, W); + SET(weighed_read_writes_by_max, float, RW); + SET(weighed_indvars_by_max, float, IndVarUpdates); + SET(hint_weights_by_max, float, HintWeights); + SET(start_bb_freq_by_max, float, StartBBFreq); + SET(end_bb_freq_by_max, float, EndBBFreq); + SET(hottest_bb_freq_by_max, float, HottestBlockFreq); + SET(liverange_size, float, Size); + SET(use_def_density, float, TotalWeight); + SET(max_stage, int64_t, MaxStage); + SET(min_stage, int64_t, MinStage); +#undef SET +} + +// Development mode-specific implementations +#ifdef LLVM_HAVE_TF_API +RegAllocEvictionAdvisorAnalysis *llvm::createDevelopmentModeAdvisor() { + return new DevelopmentModeEvictionAdvisorAnalysis(); +} + +int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition( + LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit, + uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { + int64_t Ret = 0; + if (isa<ModelUnderTrainingRunner>(getRunner())) { + Ret = MLEvictAdvisor::tryFindEvictionCandidatePosition( + VirtReg, Order, OrderLimit, CostPerUseLimit, FixedRegisters); + } else { + MCRegister PhysReg = getDefaultAdvisor().tryFindEvictionCandidate( + VirtReg, Order, CostPerUseLimit, FixedRegisters); + // Find the index of the selected PhysReg. 
We need it for logging, otherwise + // this is wasted cycles (but so would starting development mode without a + // model nor logging) + if (!PhysReg) + Ret = CandidateVirtRegPos; + else + for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); + I != E; ++I, ++Ret) + if (*I == PhysReg) + break; + } + if (TrainingLog.empty()) + return Ret; + size_t CurrentFeature = 0; + for (; CurrentFeature < FeatureIDs::FeatureCount; ++CurrentFeature) { + Log->logSpecifiedTensorValue( + CurrentFeature, reinterpret_cast<const char *>( + getRunner().getTensorUntyped(CurrentFeature))); + } + if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(&getRunner())) + for (size_t I = 1; I < MUTR->outputLoggedFeatureSpecs().size(); + ++I, ++CurrentFeature) + Log->logSpecifiedTensorValue( + CurrentFeature, + reinterpret_cast<const char *>( + MUTR->lastEvaluationResult()->getUntypedTensorValue(I))); + // The output is right after the features and the extra outputs + Log->logInt64Value(CurrentFeature, &Ret); + return Ret; +} + +bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) { + if (auto *DevModeAnalysis = dyn_cast<DevelopmentModeEvictionAdvisorAnalysis>( + &getAnalysis<RegAllocEvictionAdvisorAnalysis>())) + if (auto *Log = DevModeAnalysis->getLogger(MF)) + Log->logFloatFinalReward(static_cast<float>( + calculateRegAllocScore( + MF, getAnalysis<MachineBlockFrequencyInfo>(), + getAnalysis<AAResultsWrapperPass>().getAAResults()) + .getScore())); + + return false; +} +#endif // #ifdef LLVM_HAVE_TF_API + +#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) +RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() { + return new ReleaseModeEvictionAdvisorAnalysis(); +} +#endif + +// In all cases except development mode, we don't need scoring. +#if !defined(LLVM_HAVE_TF_API) +bool RegAllocScoring::runOnMachineFunction(MachineFunction &) { return false; } +#endif diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 692587cd58fa..c93ffaabf74c 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -96,6 +96,12 @@ static cl::opt<unsigned> AlignAllNonFallThruBlocks( "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden); +static cl::opt<unsigned> MaxBytesForAlignmentOverride( + "max-bytes-for-alignment", + cl::desc("Forces the maximum bytes allowed to be emitted when padding for " + "alignment"), + cl::init(0), cl::Hidden); + // FIXME: Find a good default for this flag and remove the flag. static cl::opt<unsigned> ExitBlockBias( "block-placement-exit-block-bias", @@ -2929,10 +2935,21 @@ void MachineBlockPlacement::alignBlocks() { MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(ChainBB)); + auto DetermineMaxAlignmentPadding = [&]() { + // Set the maximum bytes allowed to be emitted for alignment. + unsigned MaxBytes; + if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0) + MaxBytes = MaxBytesForAlignmentOverride; + else + MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB); + ChainBB->setMaxBytesForAlignment(MaxBytes); + }; + // Force alignment if all the predecessors are jumps. We already checked // that the block isn't cold above. 
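// Illustrative note (editorial, not part of the patch): with the new option, a
// hypothetical invocation such as
//
//   llc -max-bytes-for-alignment=8 ...
//
// caps the padding emitted for any block alignment set here; without the flag,
// the cap falls back to TLI->getMaxPermittedBytesForAlignment(ChainBB), as in
// the lambda above.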
if (!LayoutPred->isSuccessor(ChainBB)) { ChainBB->setAlignment(Align); + DetermineMaxAlignmentPadding(); continue; } @@ -2943,8 +2960,10 @@ void MachineBlockPlacement::alignBlocks() { BranchProbability LayoutProb = MBPI->getEdgeProbability(LayoutPred, ChainBB); BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb; - if (LayoutEdgeFreq <= (Freq * ColdProb)) + if (LayoutEdgeFreq <= (Freq * ColdProb)) { ChainBB->setAlignment(Align); + DetermineMaxAlignmentPadding(); + } } } @@ -3418,17 +3437,30 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { ComputedEdges.clear(); ChainAllocator.DestroyAll(); + bool HasMaxBytesOverride = + MaxBytesForAlignmentOverride.getNumOccurrences() > 0; + if (AlignAllBlock) // Align all of the blocks in the function to a specific alignment. - for (MachineBasicBlock &MBB : MF) - MBB.setAlignment(Align(1ULL << AlignAllBlock)); + for (MachineBasicBlock &MBB : MF) { + if (HasMaxBytesOverride) + MBB.setAlignment(Align(1ULL << AlignAllBlock), + MaxBytesForAlignmentOverride); + else + MBB.setAlignment(Align(1ULL << AlignAllBlock)); + } else if (AlignAllNonFallThruBlocks) { // Align all of the blocks that have no fall-through predecessors to a // specific alignment. for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) { auto LayoutPred = std::prev(MBI); - if (!LayoutPred->isSuccessor(&*MBI)) - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); + if (!LayoutPred->isSuccessor(&*MBI)) { + if (HasMaxBytesOverride) + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), + MaxBytesForAlignmentOverride); + else + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); + } } } if (ViewBlockLayoutWithBFI != GVDT_None && diff --git a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp index 6ddb1758719b..a39dc79baaa8 100644 --- a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp @@ -29,9 +29,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier", "Machine Dominance Frontier Construction", true, true) -MachineDominanceFrontier::MachineDominanceFrontier() - : MachineFunctionPass(ID), - Base() { +MachineDominanceFrontier::MachineDominanceFrontier() : MachineFunctionPass(ID) { initializeMachineDominanceFrontierPass(*PassRegistry::getPassRegistry()); } diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 81ed3d0e93ff..fd5ea5cad072 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -76,6 +76,8 @@ #include <utility> #include <vector> +#include "LiveDebugValues/LiveDebugValues.h" + using namespace llvm; #define DEBUG_TYPE "codegen" @@ -1238,7 +1240,7 @@ bool MachineFunction::useDebugInstrRef() const { if (F.hasFnAttribute(Attribute::OptimizeNone)) return false; - if (getTarget().Options.ValueTrackingVariableLocations) + if (llvm::debuginfoShouldUseDebugInstrRef(getTarget().getTargetTriple())) return true; return false; diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 6ca97031b92a..759cff179790 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -144,6 +144,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<Register, 8> UndefUseSet; SmallVector<MachineOperand*, 4> Defs; for (auto MII = FirstMI; MII != LastMI; ++MII) { + // Debug instructions have no 
effects to track. + if (MII->isDebugInstr()) + continue; + for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) { MachineOperand &MO = MII->getOperand(i); if (!MO.isReg()) diff --git a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp index 59fc23983d3d..5347a7b0d890 100644 --- a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp +++ b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -22,8 +22,7 @@ using namespace llvm; DiagnosticInfoMIROptimization::MachineArgument::MachineArgument( - StringRef MKey, const MachineInstr &MI) - : Argument() { + StringRef MKey, const MachineInstr &MI) { Key = std::string(MKey); raw_string_ostream OS(Val); diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 54c478645dcf..0dbbc218e946 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -796,9 +796,14 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, if (Reg == 0) continue; - // Don't handle physical register. - if (Register::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) { + if (MO.isUse() && + (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO))) + continue; + + // Don't handle non-constant and non-ignorable physical register. return false; + } // Users for the defs are all dominated by SuccToSinkTo. if (MO.isDef()) { @@ -898,7 +903,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. Alternatively, if it's allocatable, // it could get allocated to something with a def during allocation. - if (!MRI->isConstantPhysReg(Reg)) + if (!MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO)) return nullptr; } else if (!MO.isDead()) { // A def that isn't dead. We can't move it. diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index aaa6403cc978..f91a9d2c3a32 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -1704,7 +1704,7 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { // Peel out the prologs. LS.reset(); for (int I = 0; I < Schedule.getNumStages() - 1; ++I) { - LS[I] = 1; + LS[I] = true; Prologs.push_back(peelKernel(LPD_Front)); LiveStages[Prologs.back()] = LS; AvailableStages[Prologs.back()] = LS; @@ -1752,7 +1752,7 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { // Move stage one block at a time so that Phi nodes are updated correctly. 
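// Worked example (editorial, not part of the patch): for a schedule with
// getNumStages() == 3, the prolog loop above peels two blocks, with LS growing
// to {stage 0} and then {stages 0, 1}; the epilog side below restores stages
// one block at a time so the Phi nodes stay consistent.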
for (size_t K = Iteration; K > I; K--) moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage); - LS[Stage] = 1; + LS[Stage] = true; } LiveStages[Epilogs[I]] = LS; AvailableStages[Epilogs[I]] = AS; diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp index 9ed3471c0fc9..db5217469fba 100644 --- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp +++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/NonRelocatableStringpool.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index e3eb3f825851..74b903f99284 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -97,7 +97,7 @@ static bool lowerObjCCall(Function &F, const char *NewFn, objcarc::ARCInstKind Kind = objcarc::getAttachedARCFunctionKind(CB); (void)Kind; assert((Kind == objcarc::ARCInstKind::RetainRV || - Kind == objcarc::ARCInstKind::ClaimRV) && + Kind == objcarc::ARCInstKind::UnsafeClaimRV) && "use expected to be the argument of operand bundle " "\"clang.arc.attachedcall\""); U.set(FCache.getCallee()); diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index 9f1012c95964..87df7bb4a689 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "RegAllocEvictionAdvisor.h" +#include "RegAllocGreedy.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" @@ -42,6 +43,9 @@ static cl::opt<bool> EnableLocalReassignment( cl::init(false)); #define DEBUG_TYPE "regalloc" +#ifdef LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL +#define LLVM_HAVE_TF_AOT +#endif char RegAllocEvictionAdvisorAnalysis::ID = 0; INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict", @@ -62,12 +66,8 @@ public: private: std::unique_ptr<RegAllocEvictionAdvisor> - getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, - LiveIntervals *LIS, VirtRegMap *VRM, - const RegisterClassInfo &RegClassInfo, - ExtraRegInfo *ExtraInfo) override { - return std::make_unique<DefaultEvictionAdvisor>(MF, Matrix, LIS, VRM, - RegClassInfo, ExtraInfo); + getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + return std::make_unique<DefaultEvictionAdvisor>(MF, RA); } bool doInitialization(Module &M) override { if (NotAsRequested) @@ -86,10 +86,14 @@ template <> Pass *llvm::callDefaultCtor<RegAllocEvictionAdvisorAnalysis>() { Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false); break; case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development: - // TODO(mtrofin): add implementation +#if defined(LLVM_HAVE_TF_API) + Ret = createDevelopmentModeAdvisor(); +#endif break; case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release: - // TODO(mtrofin): add implementation +#if defined(LLVM_HAVE_TF_AOT) + Ret = createReleaseModeAdvisor(); +#endif break; } if (Ret) @@ -109,13 +113,12 @@ StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const { llvm_unreachable("Unknown advisor kind"); } -RegAllocEvictionAdvisor::RegAllocEvictionAdvisor( - const MachineFunction &MF, LiveRegMatrix *Matrix, LiveIntervals *LIS, - VirtRegMap *VRM, const RegisterClassInfo 
&RegClassInfo, - ExtraRegInfo *ExtraInfo) - : MF(MF), Matrix(Matrix), LIS(LIS), VRM(VRM), MRI(&VRM->getRegInfo()), - TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RegClassInfo), - RegCosts(TRI->getRegisterCosts(MF)), ExtraInfo(ExtraInfo), +RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(const MachineFunction &MF, + const RAGreedy &RA) + : MF(MF), RA(RA), Matrix(RA.getInterferenceMatrix()), + LIS(RA.getLiveIntervals()), VRM(RA.getVirtRegMap()), + MRI(&VRM->getRegInfo()), TRI(MF.getSubtarget().getRegisterInfo()), + RegClassInfo(RA.getRegClassInfo()), RegCosts(TRI->getRegisterCosts(MF)), EnableLocalReassign(EnableLocalReassignment || MF.getSubtarget().enableRALocalReassignment( MF.getTarget().getOptLevel())) {} diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h index debb75ed5020..33e03aed81a7 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h @@ -87,87 +87,9 @@ struct EvictionCost { } }; -/// Track allocation stage and eviction loop prevention during allocation. -// TODO(mtrofin): Consider exposing RAGreedy in a header instead, and folding -// this back into it. -class ExtraRegInfo final { - // RegInfo - Keep additional information about each live range. - struct RegInfo { - LiveRangeStage Stage = RS_New; - - // Cascade - Eviction loop prevention. See - // canEvictInterferenceBasedOnCost(). - unsigned Cascade = 0; - - RegInfo() = default; - }; - - IndexedMap<RegInfo, VirtReg2IndexFunctor> Info; - unsigned NextCascade = 1; - -public: - ExtraRegInfo() = default; - ExtraRegInfo(const ExtraRegInfo &) = delete; - - LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; } - - LiveRangeStage getStage(const LiveInterval &VirtReg) const { - return getStage(VirtReg.reg()); - } - - void setStage(Register Reg, LiveRangeStage Stage) { - Info.grow(Reg.id()); - Info[Reg].Stage = Stage; - } - - void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { - setStage(VirtReg.reg(), Stage); - } - - /// Return the current stage of the register, if present, otherwise initialize - /// it and return that. - LiveRangeStage getOrInitStage(Register Reg) { - Info.grow(Reg.id()); - return getStage(Reg); - } - - unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; } - - void setCascade(Register Reg, unsigned Cascade) { - Info.grow(Reg.id()); - Info[Reg].Cascade = Cascade; - } - - unsigned getOrAssignNewCascade(Register Reg) { - unsigned Cascade = getCascade(Reg); - if (!Cascade) { - Cascade = NextCascade++; - setCascade(Reg, Cascade); - } - return Cascade; - } - - unsigned getCascadeOrCurrentNext(Register Reg) const { - unsigned Cascade = getCascade(Reg); - if (!Cascade) - Cascade = NextCascade; - return Cascade; - } - - template <typename Iterator> - void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) { - for (; Begin != End; ++Begin) { - Register Reg = *Begin; - Info.grow(Reg.id()); - if (Info[Reg].Stage == RS_New) - Info[Reg].Stage = NewStage; - } - } - void LRE_DidCloneVirtReg(Register New, Register Old); -}; - /// Interface to the eviction advisor, which is responsible for making a /// decision as to which live ranges should be evicted (if any). 
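// Illustrative sketch (editorial, not part of the patch): after this
// refactoring, RAGreedy obtains and queries the advisor roughly as the
// RegAllocGreedy.cpp hunks in this diff show:
//
//   EvictAdvisor =
//       getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(*MF, *this);
//   MCRegister PhysReg = EvictAdvisor->tryFindEvictionCandidate(
//       VirtReg, Order, CostPerUseLimit, FixedRegisters);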
+class RAGreedy;
class RegAllocEvictionAdvisor {
public:
RegAllocEvictionAdvisor(const RegAllocEvictionAdvisor &) = delete;
@@ -193,14 +115,23 @@ public:
bool isUnusedCalleeSavedReg(MCRegister PhysReg) const;
protected:
- RegAllocEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix,
- LiveIntervals *LIS, VirtRegMap *VRM,
- const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo);
+ RegAllocEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA);
Register canReassign(LiveInterval &VirtReg, Register PrevReg) const;
+ // Get the upper limit of elements in the given Order we need to analyze.
+ // TODO: this is a heuristic; we could consider learning it.
+ Optional<unsigned> getOrderLimit(const LiveInterval &VirtReg,
+ const AllocationOrder &Order,
+ unsigned CostPerUseLimit) const;
+
+ // Determine if it's worth trying to allocate this reg, given the
+ // CostPerUseLimit.
+ // TODO: this is a heuristic component we could consider learning, too.
+ bool canAllocatePhysReg(unsigned CostPerUseLimit, MCRegister PhysReg) const;
+
const MachineFunction &MF;
+ const RAGreedy &RA;
LiveRegMatrix *const Matrix;
LiveIntervals *const LIS;
VirtRegMap *const VRM;
@@ -208,7 +139,6 @@ protected:
const TargetRegisterInfo *const TRI;
const RegisterClassInfo &RegClassInfo;
const ArrayRef<uint8_t> RegCosts;
- ExtraRegInfo *const ExtraInfo;
/// Run or not the local reassignment heuristic. This information is
/// obtained from the TargetSubtargetInfo.
@@ -243,19 +173,17 @@ public:
/// Get an advisor for the given context (i.e. machine function, etc.)
virtual std::unique_ptr<RegAllocEvictionAdvisor>
- getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix,
- LiveIntervals *LIS, VirtRegMap *VRM,
- const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo) = 0;
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0;
AdvisorMode getAdvisorMode() const { return Mode; }
-private:
+protected:
// This analysis preserves everything, and subclasses may have additional
// requirements.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+private:
StringRef getPassName() const override;
const AdvisorMode Mode;
};
@@ -264,25 +192,16 @@ private:
/// an instance of the eviction advisor.
template <> Pass *callDefaultCtor<RegAllocEvictionAdvisorAnalysis>();
-// TODO(mtrofin): implement these.
-#ifdef LLVM_HAVE_TF_AOT RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor(); -#endif -#ifdef LLVM_HAVE_TF_API RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor(); -#endif // TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation // out of RegAllocGreedy.cpp class DefaultEvictionAdvisor : public RegAllocEvictionAdvisor { public: - DefaultEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, - LiveIntervals *LIS, VirtRegMap *VRM, - const RegisterClassInfo &RegClassInfo, - ExtraRegInfo *ExtraInfo) - : RegAllocEvictionAdvisor(MF, Matrix, LIS, VRM, RegClassInfo, ExtraInfo) { - } + DefaultEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA) + : RegAllocEvictionAdvisor(MF, RA) {} private: MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &, diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index ce3cf31dbd6b..6ea6dbcbbb74 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "RegAllocGreedy.h" #include "AllocationOrder.h" #include "InterferenceCache.h" #include "LiveDebugVariables.h" @@ -135,362 +136,6 @@ static cl::opt<bool> ConsiderLocalIntervalCost( static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator", createGreedyRegisterAllocator); -namespace { - -class RAGreedy : public MachineFunctionPass, - public RegAllocBase, - private LiveRangeEdit::Delegate { - // Convenient shortcuts. - using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>; - using SmallLISet = SmallPtrSet<LiveInterval *, 4>; - - // context - MachineFunction *MF; - - // Shortcuts to some useful interface. - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RCI; - - // analyses - SlotIndexes *Indexes; - MachineBlockFrequencyInfo *MBFI; - MachineDominatorTree *DomTree; - MachineLoopInfo *Loops; - MachineOptimizationRemarkEmitter *ORE; - EdgeBundles *Bundles; - SpillPlacement *SpillPlacer; - LiveDebugVariables *DebugVars; - AliasAnalysis *AA; - - // state - std::unique_ptr<Spiller> SpillerInstance; - PQueue Queue; - std::unique_ptr<VirtRegAuxInfo> VRAI; - Optional<ExtraRegInfo> ExtraInfo; - std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor; - - // Enum CutOffStage to keep a track whether the register allocation failed - // because of the cutoffs encountered in last chance recoloring. - // Note: This is used as bitmask. New value should be next power of 2. - enum CutOffStage { - // No cutoffs encountered - CO_None = 0, - - // lcr-max-depth cutoff encountered - CO_Depth = 1, - - // lcr-max-interf cutoff encountered - CO_Interf = 2 - }; - - uint8_t CutOffInfo; - -#ifndef NDEBUG - static const char *const StageName[]; -#endif - - /// EvictionTrack - Keeps track of past evictions in order to optimize region - /// split decision. - class EvictionTrack { - - public: - using EvictorInfo = - std::pair<Register /* evictor */, MCRegister /* physreg */>; - using EvicteeInfo = llvm::DenseMap<Register /* evictee */, EvictorInfo>; - - private: - /// Each Vreg that has been evicted in the last stage of selectOrSplit will - /// be mapped to the evictor Vreg and the PhysReg it was evicted from. - EvicteeInfo Evictees; - - public: - /// Clear all eviction information. - void clear() { Evictees.clear(); } - - /// Clear eviction information for the given evictee Vreg. - /// E.g. 
when Vreg get's a new allocation, the old eviction info is no - /// longer relevant. - /// \param Evictee The evictee Vreg for whom we want to clear collected - /// eviction info. - void clearEvicteeInfo(Register Evictee) { Evictees.erase(Evictee); } - - /// Track new eviction. - /// The Evictor vreg has evicted the Evictee vreg from Physreg. - /// \param PhysReg The physical register Evictee was evicted from. - /// \param Evictor The evictor Vreg that evicted Evictee. - /// \param Evictee The evictee Vreg. - void addEviction(MCRegister PhysReg, Register Evictor, Register Evictee) { - Evictees[Evictee].first = Evictor; - Evictees[Evictee].second = PhysReg; - } - - /// Return the Evictor Vreg which evicted Evictee Vreg from PhysReg. - /// \param Evictee The evictee vreg. - /// \return The Evictor vreg which evicted Evictee vreg from PhysReg. 0 if - /// nobody has evicted Evictee from PhysReg. - EvictorInfo getEvictor(Register Evictee) { - if (Evictees.count(Evictee)) { - return Evictees[Evictee]; - } - - return EvictorInfo(0, 0); - } - }; - - // Keeps track of past evictions in order to optimize region split decision. - EvictionTrack LastEvicted; - - // splitting state. - std::unique_ptr<SplitAnalysis> SA; - std::unique_ptr<SplitEditor> SE; - - /// Cached per-block interference maps - InterferenceCache IntfCache; - - /// All basic blocks where the current register has uses. - SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints; - - /// Global live range splitting candidate info. - struct GlobalSplitCandidate { - // Register intended for assignment, or 0. - MCRegister PhysReg; - - // SplitKit interval index for this candidate. - unsigned IntvIdx; - - // Interference for PhysReg. - InterferenceCache::Cursor Intf; - - // Bundles where this candidate should be live. - BitVector LiveBundles; - SmallVector<unsigned, 8> ActiveBlocks; - - void reset(InterferenceCache &Cache, MCRegister Reg) { - PhysReg = Reg; - IntvIdx = 0; - Intf.setPhysReg(Cache, Reg); - LiveBundles.clear(); - ActiveBlocks.clear(); - } - - // Set B[I] = C for every live bundle where B[I] was NoCand. - unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) { - unsigned Count = 0; - for (unsigned I : LiveBundles.set_bits()) - if (B[I] == NoCand) { - B[I] = C; - Count++; - } - return Count; - } - }; - - /// Candidate info for each PhysReg in AllocationOrder. - /// This vector never shrinks, but grows to the size of the largest register - /// class. - SmallVector<GlobalSplitCandidate, 32> GlobalCand; - - enum : unsigned { NoCand = ~0u }; - - /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to - /// NoCand which indicates the stack interval. - SmallVector<unsigned, 32> BundleCand; - - /// Callee-save register cost, calculated once per machine function. - BlockFrequency CSRCost; - - /// Enable or not the consideration of the cost of local intervals created - /// by a split candidate when choosing the best split candidate. - bool EnableAdvancedRASplitCost; - - /// Set of broken hints that may be reconciled later because of eviction. - SmallSetVector<LiveInterval *, 8> SetOfBrokenHints; - - /// The register cost values. This list will be recreated for each Machine - /// Function - ArrayRef<uint8_t> RegCosts; - -public: - RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); - - /// Return the pass name. - StringRef getPassName() const override { return "Greedy Register Allocator"; } - - /// RAGreedy analysis usage. 
- void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - Spiller &spiller() override { return *SpillerInstance; } - void enqueueImpl(LiveInterval *LI) override; - LiveInterval *dequeue() override; - MCRegister selectOrSplit(LiveInterval &, - SmallVectorImpl<Register> &) override; - void aboutToRemoveInterval(LiveInterval &) override; - - /// Perform register allocation. - bool runOnMachineFunction(MachineFunction &mf) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoPHIs); - } - - MachineFunctionProperties getClearedProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::IsSSA); - } - - static char ID; - -private: - MCRegister selectOrSplitImpl(LiveInterval &, SmallVectorImpl<Register> &, - SmallVirtRegSet &, unsigned = 0); - - bool LRE_CanEraseVirtReg(Register) override; - void LRE_WillShrinkVirtReg(Register) override; - void LRE_DidCloneVirtReg(Register, Register) override; - void enqueue(PQueue &CurQueue, LiveInterval *LI); - LiveInterval *dequeue(PQueue &CurQueue); - - BlockFrequency calcSpillCost(); - bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency&); - bool addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>); - bool growRegion(GlobalSplitCandidate &Cand); - bool splitCanCauseEvictionChain(Register Evictee, GlobalSplitCandidate &Cand, - unsigned BBNumber, - const AllocationOrder &Order); - bool splitCanCauseLocalSpill(unsigned VirtRegToSplit, - GlobalSplitCandidate &Cand, unsigned BBNumber, - const AllocationOrder &Order); - BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &, - const AllocationOrder &Order, - bool *CanCauseEvictionChain); - bool calcCompactRegion(GlobalSplitCandidate&); - void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>); - void calcGapWeights(MCRegister, SmallVectorImpl<float> &); - bool canEvictInterferenceInRange(const LiveInterval &VirtReg, - MCRegister PhysReg, SlotIndex Start, - SlotIndex End, EvictionCost &MaxCost) const; - MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order, - const LiveInterval &VirtReg, - SlotIndex Start, SlotIndex End, - float *BestEvictWeight) const; - void evictInterference(LiveInterval &, MCRegister, - SmallVectorImpl<Register> &); - bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg, - SmallLISet &RecoloringCandidates, - const SmallVirtRegSet &FixedRegisters); - - MCRegister tryAssign(LiveInterval&, AllocationOrder&, - SmallVectorImpl<Register>&, - const SmallVirtRegSet&); - MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &, - uint8_t, const SmallVirtRegSet &) const; - MCRegister tryEvict(LiveInterval &, AllocationOrder &, - SmallVectorImpl<Register> &, uint8_t, - const SmallVirtRegSet &); - MCRegister tryRegionSplit(LiveInterval &, AllocationOrder &, - SmallVectorImpl<Register> &); - /// Calculate cost of region splitting. - unsigned calculateRegionSplitCost(LiveInterval &VirtReg, - AllocationOrder &Order, - BlockFrequency &BestCost, - unsigned &NumCands, bool IgnoreCSR, - bool *CanCauseEvictionChain = nullptr); - /// Perform region splitting. - unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, - bool HasCompact, - SmallVectorImpl<Register> &NewVRegs); - /// Check other options before using a callee-saved register for the first - /// time. 
- MCRegister tryAssignCSRFirstTime(LiveInterval &VirtReg, - AllocationOrder &Order, MCRegister PhysReg, - uint8_t &CostPerUseLimit, - SmallVectorImpl<Register> &NewVRegs); - void initializeCSRCost(); - unsigned tryBlockSplit(LiveInterval&, AllocationOrder&, - SmallVectorImpl<Register>&); - unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&, - SmallVectorImpl<Register>&); - unsigned tryLocalSplit(LiveInterval&, AllocationOrder&, - SmallVectorImpl<Register>&); - unsigned trySplit(LiveInterval&, AllocationOrder&, - SmallVectorImpl<Register>&, - const SmallVirtRegSet&); - unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &, - SmallVectorImpl<Register> &, - SmallVirtRegSet &, unsigned); - bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &, - SmallVirtRegSet &, unsigned); - void tryHintRecoloring(LiveInterval &); - void tryHintsRecoloring(); - - /// Model the information carried by one end of a copy. - struct HintInfo { - /// The frequency of the copy. - BlockFrequency Freq; - /// The virtual register or physical register. - Register Reg; - /// Its currently assigned register. - /// In case of a physical register Reg == PhysReg. - MCRegister PhysReg; - - HintInfo(BlockFrequency Freq, Register Reg, MCRegister PhysReg) - : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {} - }; - using HintsInfo = SmallVector<HintInfo, 4>; - - BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister); - void collectHintInfo(Register, HintsInfo &); - - /// Greedy RA statistic to remark. - struct RAGreedyStats { - unsigned Reloads = 0; - unsigned FoldedReloads = 0; - unsigned ZeroCostFoldedReloads = 0; - unsigned Spills = 0; - unsigned FoldedSpills = 0; - unsigned Copies = 0; - float ReloadsCost = 0.0f; - float FoldedReloadsCost = 0.0f; - float SpillsCost = 0.0f; - float FoldedSpillsCost = 0.0f; - float CopiesCost = 0.0f; - - bool isEmpty() { - return !(Reloads || FoldedReloads || Spills || FoldedSpills || - ZeroCostFoldedReloads || Copies); - } - - void add(RAGreedyStats other) { - Reloads += other.Reloads; - FoldedReloads += other.FoldedReloads; - ZeroCostFoldedReloads += other.ZeroCostFoldedReloads; - Spills += other.Spills; - FoldedSpills += other.FoldedSpills; - Copies += other.Copies; - ReloadsCost += other.ReloadsCost; - FoldedReloadsCost += other.FoldedReloadsCost; - SpillsCost += other.SpillsCost; - FoldedSpillsCost += other.FoldedSpillsCost; - CopiesCost += other.CopiesCost; - } - - void report(MachineOptimizationRemarkMissed &R); - }; - - /// Compute statistic for a basic block. - RAGreedyStats computeStats(MachineBasicBlock &MBB); - - /// Compute and report statistic through a remark. - RAGreedyStats reportStats(MachineLoop *L); - - /// Report the statistic for each loop. - void reportStats(); -}; - -} // end anonymous namespace - char RAGreedy::ID = 0; char &llvm::RAGreedyID = RAGreedy::ID; @@ -613,7 +258,7 @@ void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) { ExtraInfo->LRE_DidCloneVirtReg(New, Old); } -void ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) { +void RAGreedy::ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) { // Cloning a register we haven't even heard about yet? Just ignore it. 
if (!Info.inBounds(Old)) return; @@ -811,7 +456,7 @@ Register RegAllocEvictionAdvisor::canReassign(LiveInterval &VirtReg, bool DefaultEvictionAdvisor::shouldEvict(LiveInterval &A, bool IsHint, LiveInterval &B, bool BreaksHint) const { - bool CanSplit = ExtraInfo->getStage(B) < RS_Spill; + bool CanSplit = RA.getExtraInfo().getStage(B) < RS_Spill; // Be fairly aggressive about following hints as long as the evictee can be // split. @@ -852,7 +497,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) return false; - bool IsLocal = LIS->intervalIsInOneMBB(VirtReg); + bool IsLocal = VirtReg.empty() || LIS->intervalIsInOneMBB(VirtReg); // Find VirtReg's cascade number. This will be unassigned if VirtReg was never // involved in an eviction before. If a cascade number was assigned, deny @@ -861,7 +506,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( // // This works out so a register without a cascade number is allowed to evict // anything, and it can be evicted by anything. - unsigned Cascade = ExtraInfo->getCascadeOrCurrentNext(VirtReg.reg()); + unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); EvictionCost Cost; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { @@ -883,7 +528,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( return false; // Never evict spill products. They cannot split or spill. - if (ExtraInfo->getStage(*Intf) == RS_Done) + if (RA.getExtraInfo().getStage(*Intf) == RS_Done) return false; // Once a live range becomes small enough, it is urgent that we find a // register for it. This is indicated by an infinite spill weight. These @@ -898,7 +543,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( RegClassInfo.getNumAllocatableRegs( MRI->getRegClass(Intf->reg()))); // Only evict older cascades or live ranges without a cascade. - unsigned IntfCascade = ExtraInfo->getCascade(Intf->reg()); + unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg()); if (Cascade <= IntfCascade) { if (!Urgent) return false; @@ -1069,28 +714,20 @@ bool RegAllocEvictionAdvisor::isUnusedCalleeSavedReg(MCRegister PhysReg) const { return !Matrix->isPhysRegUsed(PhysReg); } -MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate( - LiveInterval &VirtReg, const AllocationOrder &Order, - uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { - // Keep track of the cheapest interference seen so far. - EvictionCost BestCost; - BestCost.setMax(); - MCRegister BestPhys; +Optional<unsigned> +RegAllocEvictionAdvisor::getOrderLimit(const LiveInterval &VirtReg, + const AllocationOrder &Order, + unsigned CostPerUseLimit) const { unsigned OrderLimit = Order.getOrder().size(); - // When we are just looking for a reduced cost per use, don't break any - // hints, and only evict smaller spill weights. if (CostPerUseLimit < uint8_t(~0u)) { - BestCost.BrokenHints = 0; - BestCost.MaxWeight = VirtReg.weight(); - // Check of any registers in RC are below CostPerUseLimit. 
const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg()); uint8_t MinCost = RegClassInfo.getMinCost(RC); if (MinCost >= CostPerUseLimit) { LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = " << MinCost << ", no cheaper registers to be found.\n"); - return 0; + return None; } // It is normal for register classes to have a long tail of registers with @@ -1101,24 +738,50 @@ MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate( << " regs.\n"); } } + return OrderLimit; +} + +bool RegAllocEvictionAdvisor::canAllocatePhysReg(unsigned CostPerUseLimit, + MCRegister PhysReg) const { + if (RegCosts[PhysReg] >= CostPerUseLimit) + return false; + // The first use of a callee-saved register in a function has cost 1. + // Don't start using a CSR when the CostPerUseLimit is low. + if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) { + LLVM_DEBUG( + dbgs() << printReg(PhysReg, TRI) << " would clobber CSR " + << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI) + << '\n'); + return false; + } + return true; +} + +MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate( + LiveInterval &VirtReg, const AllocationOrder &Order, + uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { + // Keep track of the cheapest interference seen so far. + EvictionCost BestCost; + BestCost.setMax(); + MCRegister BestPhys; + auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit); + if (!MaybeOrderLimit) + return MCRegister::NoRegister; + unsigned OrderLimit = *MaybeOrderLimit; + + // When we are just looking for a reduced cost per use, don't break any + // hints, and only evict smaller spill weights. + if (CostPerUseLimit < uint8_t(~0u)) { + BestCost.BrokenHints = 0; + BestCost.MaxWeight = VirtReg.weight(); + } for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E; ++I) { MCRegister PhysReg = *I; assert(PhysReg); - if (RegCosts[PhysReg] >= CostPerUseLimit) - continue; - // The first use of a callee-saved register in a function has cost 1. - // Don't start using a CSR when the CostPerUseLimit is low. - if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) { - LLVM_DEBUG( - dbgs() << printReg(PhysReg, TRI) << " would clobber CSR " - << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI) - << '\n'); - continue; - } - - if (!canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost, + if (!canAllocatePhysReg(CostPerUseLimit, PhysReg) || + !canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost, FixedRegisters)) continue; @@ -3269,8 +2932,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI)); ExtraInfo.emplace(); - EvictAdvisor = getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor( - *MF, Matrix, LIS, VRM, RegClassInfo, &*ExtraInfo); + EvictAdvisor = + getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(*MF, *this); IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. SetOfBrokenHints.clear(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h new file mode 100644 index 000000000000..e9a5fe635f26 --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -0,0 +1,507 @@ +//==- RegAllocGreedy.h ------- greedy register allocator ----------*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file defines the RAGreedy function pass for register allocation in +// optimized builds. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_ +#define LLVM_CODEGEN_REGALLOCGREEDY_H_ + +#include "AllocationOrder.h" +#include "InterferenceCache.h" +#include "LiveDebugVariables.h" +#include "RegAllocBase.h" +#include "RegAllocEvictionAdvisor.h" +#include "SpillPlacement.h" +#include "SplitKit.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRangeEdit.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/LiveStacks.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/Spiller.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> +#include <queue> +#include <tuple> +#include <utility> + +namespace llvm { +class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass, + public RegAllocBase, + private LiveRangeEdit::Delegate { + // Interface to eviction advisers +public: + /// Track allocation stage and eviction loop prevention during allocation. + class ExtraRegInfo final { + // RegInfo - Keep additional information about each live range. + struct RegInfo { + LiveRangeStage Stage = RS_New; + + // Cascade - Eviction loop prevention. See + // canEvictInterferenceBasedOnCost(). 
+ unsigned Cascade = 0; + + RegInfo() = default; + }; + + IndexedMap<RegInfo, VirtReg2IndexFunctor> Info; + unsigned NextCascade = 1; + + public: + ExtraRegInfo() = default; + ExtraRegInfo(const ExtraRegInfo &) = delete; + + LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; } + + LiveRangeStage getStage(const LiveInterval &VirtReg) const { + return getStage(VirtReg.reg()); + } + + void setStage(Register Reg, LiveRangeStage Stage) { + Info.grow(Reg.id()); + Info[Reg].Stage = Stage; + } + + void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { + setStage(VirtReg.reg(), Stage); + } + + /// Return the current stage of the register, if present, otherwise + /// initialize it and return that. + LiveRangeStage getOrInitStage(Register Reg) { + Info.grow(Reg.id()); + return getStage(Reg); + } + + unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; } + + void setCascade(Register Reg, unsigned Cascade) { + Info.grow(Reg.id()); + Info[Reg].Cascade = Cascade; + } + + unsigned getOrAssignNewCascade(Register Reg) { + unsigned Cascade = getCascade(Reg); + if (!Cascade) { + Cascade = NextCascade++; + setCascade(Reg, Cascade); + } + return Cascade; + } + + unsigned getCascadeOrCurrentNext(Register Reg) const { + unsigned Cascade = getCascade(Reg); + if (!Cascade) + Cascade = NextCascade; + return Cascade; + } + + template <typename Iterator> + void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) { + for (; Begin != End; ++Begin) { + Register Reg = *Begin; + Info.grow(Reg.id()); + if (Info[Reg].Stage == RS_New) + Info[Reg].Stage = NewStage; + } + } + void LRE_DidCloneVirtReg(Register New, Register Old); + }; + + LiveRegMatrix *getInterferenceMatrix() const { return Matrix; } + LiveIntervals *getLiveIntervals() const { return LIS; } + VirtRegMap *getVirtRegMap() const { return VRM; } + const RegisterClassInfo &getRegClassInfo() const { return RegClassInfo; } + const ExtraRegInfo &getExtraInfo() const { return *ExtraInfo; } + size_t getQueueSize() const { return Queue.size(); } + // end (interface to eviction advisers) + +private: + // Convenient shortcuts. + using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>; + using SmallLISet = SmallPtrSet<LiveInterval *, 4>; + + // context + MachineFunction *MF; + + // Shortcuts to some useful interface. + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RCI; + + // analyses + SlotIndexes *Indexes; + MachineBlockFrequencyInfo *MBFI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + MachineOptimizationRemarkEmitter *ORE; + EdgeBundles *Bundles; + SpillPlacement *SpillPlacer; + LiveDebugVariables *DebugVars; + AliasAnalysis *AA; + + // state + std::unique_ptr<Spiller> SpillerInstance; + PQueue Queue; + std::unique_ptr<VirtRegAuxInfo> VRAI; + Optional<ExtraRegInfo> ExtraInfo; + std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor; + + // Enum CutOffStage to keep a track whether the register allocation failed + // because of the cutoffs encountered in last chance recoloring. + // Note: This is used as bitmask. New value should be next power of 2. + enum CutOffStage { + // No cutoffs encountered + CO_None = 0, + + // lcr-max-depth cutoff encountered + CO_Depth = 1, + + // lcr-max-interf cutoff encountered + CO_Interf = 2 + }; + + uint8_t CutOffInfo; + +#ifndef NDEBUG + static const char *const StageName[]; +#endif + + /// EvictionTrack - Keeps track of past evictions in order to optimize region + /// split decision. 
+ class EvictionTrack { + + public: + using EvictorInfo = + std::pair<Register /* evictor */, MCRegister /* physreg */>; + using EvicteeInfo = llvm::DenseMap<Register /* evictee */, EvictorInfo>; + + private: + /// Each Vreg that has been evicted in the last stage of selectOrSplit will + /// be mapped to the evictor Vreg and the PhysReg it was evicted from. + EvicteeInfo Evictees; + + public: + /// Clear all eviction information. + void clear() { Evictees.clear(); } + + /// Clear eviction information for the given evictee Vreg. + /// E.g. when Vreg gets a new allocation, the old eviction info is no + /// longer relevant. + /// \param Evictee The evictee Vreg for whom we want to clear collected + /// eviction info. + void clearEvicteeInfo(Register Evictee) { Evictees.erase(Evictee); } + + /// Track new eviction. + /// The Evictor vreg has evicted the Evictee vreg from PhysReg. + /// \param PhysReg The physical register Evictee was evicted from. + /// \param Evictor The evictor Vreg that evicted Evictee. + /// \param Evictee The evictee Vreg. + void addEviction(MCRegister PhysReg, Register Evictor, Register Evictee) { + Evictees[Evictee].first = Evictor; + Evictees[Evictee].second = PhysReg; + } + + /// Return the Evictor Vreg which evicted Evictee Vreg from PhysReg. + /// \param Evictee The evictee vreg. + /// \return The Evictor vreg which evicted Evictee vreg from PhysReg. 0 if + /// nobody has evicted Evictee from PhysReg. + EvictorInfo getEvictor(Register Evictee) { + if (Evictees.count(Evictee)) { + return Evictees[Evictee]; + } + + return EvictorInfo(0, 0); + } + }; + + // Keeps track of past evictions in order to optimize region split decision. + EvictionTrack LastEvicted; + + // splitting state. + std::unique_ptr<SplitAnalysis> SA; + std::unique_ptr<SplitEditor> SE; + + /// Cached per-block interference maps + InterferenceCache IntfCache; + + /// All basic blocks where the current register has uses. + SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints; + + /// Global live range splitting candidate info. + struct GlobalSplitCandidate { + // Register intended for assignment, or 0. + MCRegister PhysReg; + + // SplitKit interval index for this candidate. + unsigned IntvIdx; + + // Interference for PhysReg. + InterferenceCache::Cursor Intf; + + // Bundles where this candidate should be live. + BitVector LiveBundles; + SmallVector<unsigned, 8> ActiveBlocks; + + void reset(InterferenceCache &Cache, MCRegister Reg) { + PhysReg = Reg; + IntvIdx = 0; + Intf.setPhysReg(Cache, Reg); + LiveBundles.clear(); + ActiveBlocks.clear(); + } + + // Set B[I] = C for every live bundle where B[I] was NoCand. + unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) { + unsigned Count = 0; + for (unsigned I : LiveBundles.set_bits()) + if (B[I] == NoCand) { + B[I] = C; + Count++; + } + return Count; + } + }; + + /// Candidate info for each PhysReg in AllocationOrder. + /// This vector never shrinks, but grows to the size of the largest register + /// class. + SmallVector<GlobalSplitCandidate, 32> GlobalCand; + + enum : unsigned { NoCand = ~0u }; + + /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to + /// NoCand which indicates the stack interval. + SmallVector<unsigned, 32> BundleCand; + + /// Callee-save register cost, calculated once per machine function. + BlockFrequency CSRCost; + + /// Whether to consider the cost of local intervals created by a split + /// candidate when choosing the best split candidate.
+ bool EnableAdvancedRASplitCost; + + /// Set of broken hints that may be reconciled later because of eviction. + SmallSetVector<LiveInterval *, 8> SetOfBrokenHints; + + /// The register cost values. This list will be recreated for each Machine + /// Function + ArrayRef<uint8_t> RegCosts; + +public: + RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); + + /// Return the pass name. + StringRef getPassName() const override { return "Greedy Register Allocator"; } + + /// RAGreedy analysis usage. + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + Spiller &spiller() override { return *SpillerInstance; } + void enqueueImpl(LiveInterval *LI) override; + LiveInterval *dequeue() override; + MCRegister selectOrSplit(LiveInterval &, + SmallVectorImpl<Register> &) override; + void aboutToRemoveInterval(LiveInterval &) override; + + /// Perform register allocation. + bool runOnMachineFunction(MachineFunction &mf) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } + + static char ID; + +private: + MCRegister selectOrSplitImpl(LiveInterval &, SmallVectorImpl<Register> &, + SmallVirtRegSet &, unsigned = 0); + + bool LRE_CanEraseVirtReg(Register) override; + void LRE_WillShrinkVirtReg(Register) override; + void LRE_DidCloneVirtReg(Register, Register) override; + void enqueue(PQueue &CurQueue, LiveInterval *LI); + LiveInterval *dequeue(PQueue &CurQueue); + + BlockFrequency calcSpillCost(); + bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency &); + bool addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>); + bool growRegion(GlobalSplitCandidate &Cand); + bool splitCanCauseEvictionChain(Register Evictee, GlobalSplitCandidate &Cand, + unsigned BBNumber, + const AllocationOrder &Order); + bool splitCanCauseLocalSpill(unsigned VirtRegToSplit, + GlobalSplitCandidate &Cand, unsigned BBNumber, + const AllocationOrder &Order); + BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &, + const AllocationOrder &Order, + bool *CanCauseEvictionChain); + bool calcCompactRegion(GlobalSplitCandidate &); + void splitAroundRegion(LiveRangeEdit &, ArrayRef<unsigned>); + void calcGapWeights(MCRegister, SmallVectorImpl<float> &); + bool canEvictInterferenceInRange(const LiveInterval &VirtReg, + MCRegister PhysReg, SlotIndex Start, + SlotIndex End, EvictionCost &MaxCost) const; + MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order, + const LiveInterval &VirtReg, + SlotIndex Start, SlotIndex End, + float *BestEvictWeight) const; + void evictInterference(LiveInterval &, MCRegister, + SmallVectorImpl<Register> &); + bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg, + SmallLISet &RecoloringCandidates, + const SmallVirtRegSet &FixedRegisters); + + MCRegister tryAssign(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &, const SmallVirtRegSet &); + MCRegister tryEvict(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &, uint8_t, + const SmallVirtRegSet &); + MCRegister tryRegionSplit(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &); + /// Calculate cost of region splitting. 
+ unsigned calculateRegionSplitCost(LiveInterval &VirtReg, + AllocationOrder &Order, + BlockFrequency &BestCost, + unsigned &NumCands, bool IgnoreCSR, + bool *CanCauseEvictionChain = nullptr); + /// Perform region splitting. + unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand, + bool HasCompact, SmallVectorImpl<Register> &NewVRegs); + /// Check other options before using a callee-saved register for the first + /// time. + MCRegister tryAssignCSRFirstTime(LiveInterval &VirtReg, + AllocationOrder &Order, MCRegister PhysReg, + uint8_t &CostPerUseLimit, + SmallVectorImpl<Register> &NewVRegs); + void initializeCSRCost(); + unsigned tryBlockSplit(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &); + unsigned tryInstructionSplit(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &); + unsigned tryLocalSplit(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &); + unsigned trySplit(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &, const SmallVirtRegSet &); + unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &, + SmallVectorImpl<Register> &, + SmallVirtRegSet &, unsigned); + bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &, + SmallVirtRegSet &, unsigned); + void tryHintRecoloring(LiveInterval &); + void tryHintsRecoloring(); + + /// Model the information carried by one end of a copy. + struct HintInfo { + /// The frequency of the copy. + BlockFrequency Freq; + /// The virtual register or physical register. + Register Reg; + /// Its currently assigned register. + /// In case of a physical register Reg == PhysReg. + MCRegister PhysReg; + + HintInfo(BlockFrequency Freq, Register Reg, MCRegister PhysReg) + : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {} + }; + using HintsInfo = SmallVector<HintInfo, 4>; + + BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister); + void collectHintInfo(Register, HintsInfo &); + + /// Greedy RA statistic to remark. + struct RAGreedyStats { + unsigned Reloads = 0; + unsigned FoldedReloads = 0; + unsigned ZeroCostFoldedReloads = 0; + unsigned Spills = 0; + unsigned FoldedSpills = 0; + unsigned Copies = 0; + float ReloadsCost = 0.0f; + float FoldedReloadsCost = 0.0f; + float SpillsCost = 0.0f; + float FoldedSpillsCost = 0.0f; + float CopiesCost = 0.0f; + + bool isEmpty() { + return !(Reloads || FoldedReloads || Spills || FoldedSpills || + ZeroCostFoldedReloads || Copies); + } + + void add(RAGreedyStats other) { + Reloads += other.Reloads; + FoldedReloads += other.FoldedReloads; + ZeroCostFoldedReloads += other.ZeroCostFoldedReloads; + Spills += other.Spills; + FoldedSpills += other.FoldedSpills; + Copies += other.Copies; + ReloadsCost += other.ReloadsCost; + FoldedReloadsCost += other.FoldedReloadsCost; + SpillsCost += other.SpillsCost; + FoldedSpillsCost += other.FoldedSpillsCost; + CopiesCost += other.CopiesCost; + } + + void report(MachineOptimizationRemarkMissed &R); + }; + + /// Compute statistic for a basic block. + RAGreedyStats computeStats(MachineBasicBlock &MBB); + + /// Compute and report statistic through a remark. + RAGreedyStats reportStats(MachineLoop *L); + + /// Report the statistic for each loop. 
+ void reportStats(); +}; +} // namespace llvm +#endif // #ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_ diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index c0a07ec4c91d..424ad7419165 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -533,6 +533,22 @@ Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC, Candidates.reset(*AI); } + // If we have already scavenged some registers, remove them from the + // candidates. If we end up recursively calling eliminateFrameIndex, we don't + // want to be clobbering previously scavenged registers or their associated + // stack slots. + for (ScavengedInfo &SI : Scavenged) { + if (SI.Reg) { + if (isRegUsed(SI.Reg)) { + LLVM_DEBUG( + dbgs() << "Removing " << printReg(SI.Reg, TRI) << + " from scavenging candidates since it was already scavenged\n"); + for (MCRegAliasIterator AI(SI.Reg, TRI, true); AI.isValid(); ++AI) + Candidates.reset(*AI); + } + } + } + // Try to find a register that's unused if there is one, as then we won't // have to spill. BitVector Available = getRegsAvailable(RC); @@ -553,6 +569,12 @@ Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC, if (!AllowSpill) return 0; +#ifndef NDEBUG + for (ScavengedInfo &SI : Scavenged) { + assert(SI.Reg != SReg && "scavenged a previously scavenged register"); + } +#endif + ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI); Scavenged.Restore = &*std::prev(UseMI); diff --git a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 6e05de888cc0..a61a2b2728fa 100644 --- a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -30,8 +30,7 @@ using namespace llvm; ScoreboardHazardRecognizer::ScoreboardHazardRecognizer( const InstrItineraryData *II, const ScheduleDAG *SchedDAG, const char *ParentDebugType) - : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II), - DAG(SchedDAG) { + : DebugType(ParentDebugType), ItinData(II), DAG(SchedDAG) { (void)DebugType; // Determine the maximum depth of any itinerary. This determines the depth of // the scoreboard. 
We always make the scoreboard at least 1 cycle deep to diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 067ad819e0d2..932f263d2558 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -593,7 +593,7 @@ namespace { SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue MatchLoadCombine(SDNode *N); SDValue mergeTruncStores(StoreSDNode *N); - SDValue ReduceLoadWidth(SDNode *N); + SDValue reduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); SDValue TransformFPLoadStorePair(SDNode *N); @@ -1070,7 +1070,7 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, return DAG.getNode(Opc, DL, VT, N00, OpNode); return SDValue(); } - if (N0.hasOneUse()) { + if (TLI.isReassocProfitable(DAG, N0, N1)) { // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) // iff (op x, c1) has one use if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1)) @@ -3058,9 +3058,8 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, // // Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with // a single path for carry/borrow out propagation: -static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, - const TargetLowering &TLI, SDValue Carry0, - SDValue Carry1, SDNode *N) { +static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, + SDValue Carry0, SDValue Carry1, SDNode *N) { if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1) return SDValue(); unsigned Opcode = Carry0.getOpcode(); @@ -3908,7 +3907,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one // use. { - SDValue Sh(nullptr, 0), Y(nullptr, 0); + SDValue Sh, Y; // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). if (N0.getOpcode() == ISD::SHL && @@ -4471,15 +4470,15 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { return FoldedVOp; // fold (mulhs x, 0) -> 0 - // do not return N0/N1, because undef node may exist. - if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || - ISD::isConstantSplatVectorAllZeros(N1.getNode())) + // do not return N1, because undef node may exist. + if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) return DAG.getConstant(0, DL, VT); } // fold (mulhs x, 0) -> 0 if (isNullConstant(N1)) return N1; + // fold (mulhs x, 1) -> (sra x, size(x)-1) if (isOneConstant(N1)) return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0, @@ -4531,18 +4530,19 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { return FoldedVOp; // fold (mulhu x, 0) -> 0 - // do not return N0/N1, because undef node may exist. - if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || - ISD::isConstantSplatVectorAllZeros(N1.getNode())) + // do not return N1, because undef node may exist. 
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) return DAG.getConstant(0, DL, VT); } // fold (mulhu x, 0) -> 0 if (isNullConstant(N1)) return N1; + // fold (mulhu x, 1) -> 0 if (isOneConstant(N1)) return DAG.getConstant(0, DL, N0.getValueType()); + // fold (mulhu x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); @@ -4892,6 +4892,42 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, : DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); } +static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, + SDValue N3, ISD::CondCode CC, + SelectionDAG &DAG) { + // We are looking for UMIN(FPTOUI(X), (2^n)-1), which may have come via a + // select/vselect/select_cc. The two operand pairs for the select (N2/N3) may + // be truncated versions of the setcc (N0/N1). + if ((N0 != N2 && + (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) || + N0.getOpcode() != ISD::FP_TO_UINT || CC != ISD::SETULT) + return SDValue(); + ConstantSDNode *N1C = isConstOrConstSplat(N1); + ConstantSDNode *N3C = isConstOrConstSplat(N3); + if (!N1C || !N3C) + return SDValue(); + const APInt &C1 = N1C->getAPIntValue(); + const APInt &C3 = N3C->getAPIntValue(); + if (!(C1 + 1).isPowerOf2() || C1.getBitWidth() < C3.getBitWidth() || + C1 != C3.zextOrSelf(C1.getBitWidth())) + return SDValue(); + + unsigned BW = (C1 + 1).exactLogBase2(); + EVT FPVT = N0.getOperand(0).getValueType(); + EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW); + if (FPVT.isVector()) + NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT, + FPVT.getVectorElementCount()); + if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT, + FPVT, NewVT)) + return SDValue(); + + SDValue Sat = + DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(N0), NewVT, N0.getOperand(0), + DAG.getValueType(NewVT.getScalarType())); + return DAG.getZExtOrTrunc(Sat, SDLoc(N0), N3.getValueType()); +} + SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4934,6 +4970,9 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { if (SDValue S = PerformMinMaxFpToSatCombine( N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG)) return S; + if (Opcode == ISD::UMIN) + if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG)) + return S; // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) @@ -5491,6 +5530,8 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, // Some constants may need fixing up later if they are too large. if (auto *C = dyn_cast<ConstantSDNode>(Op)) { + if (Mask->getValueType(0) != C->getValueType(0)) + return false; if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) && (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue()) NodesWithConsts.insert(N); @@ -5524,9 +5565,9 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, case ISD::AssertZext: { unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); - EVT VT = Op.getOpcode() == ISD::AssertZext ? - cast<VTSDNode>(Op.getOperand(1))->getVT() : - Op.getOperand(0).getValueType(); + EVT VT = Op.getOpcode() == ISD::AssertZext + ? cast<VTSDNode>(Op.getOperand(1))->getVT() + : Op.getOperand(0).getValueType(); // We can accept extending nodes if the mask is wider or an equal // width to the original type.
@@ -5534,6 +5575,15 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, continue; break; } + case ISD::ANY_EXTEND: { + unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); + EVT VT = Op.getOperand(0).getValueType(); + if (ExtVT.bitsGE(VT)) + break; + // Fallthrough to searching for nodes from the operands of the extend. + LLVM_FALLTHROUGH; + } case ISD::OR: case ISD::XOR: case ISD::AND: @@ -5593,12 +5643,14 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) { // masking. if (FixupNode) { LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump()); - SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode), - FixupNode->getValueType(0), - SDValue(FixupNode, 0), MaskOp); + SDValue MaskOpT = DAG.getZExtOrTrunc(MaskOp, SDLoc(FixupNode), + FixupNode->getValueType(0)); + SDValue And = + DAG.getNode(ISD::AND, SDLoc(FixupNode), FixupNode->getValueType(0), + SDValue(FixupNode, 0), MaskOpT); DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And); if (And.getOpcode() == ISD ::AND) - DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp); + DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOpT); } // Narrow any constants that need it. @@ -5607,10 +5659,12 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) { SDValue Op1 = LogicN->getOperand(1); if (isa<ConstantSDNode>(Op0)) - std::swap(Op0, Op1); + std::swap(Op0, Op1); - SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(), - Op1, MaskOp); + SDValue MaskOpT = + DAG.getZExtOrTrunc(MaskOp, SDLoc(Op1), Op1.getValueType()); + SDValue And = + DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(), Op1, MaskOpT); DAG.UpdateNodeOperands(LogicN, Op0, And); } @@ -5618,13 +5672,15 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) { // Create narrow loads. 
for (auto *Load : Loads) { LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump()); + SDValue MaskOpT = + DAG.getZExtOrTrunc(MaskOp, SDLoc(Load), Load->getValueType(0)); SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0), - SDValue(Load, 0), MaskOp); + SDValue(Load, 0), MaskOpT); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And); if (And.getOpcode() == ISD ::AND) And = SDValue( - DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0); - SDValue NewLoad = ReduceLoadWidth(And.getNode()); + DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOpT), 0); + SDValue NewLoad = reduceLoadWidth(And.getNode()); assert(NewLoad && "Shouldn't be masking the load if it can't be narrowed"); CombineTo(Load, NewLoad, NewLoad.getValue(1)); @@ -5799,18 +5855,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return FoldedVOp; // fold (and x, 0) -> 0, vector edition - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - // do not return N0, because undef node may exist in N0 - return DAG.getConstant(APInt::getZero(N0.getScalarValueSizeInBits()), - SDLoc(N), N0.getValueType()); if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()), SDLoc(N), N1.getValueType()); // fold (and x, -1) -> x, vector edition - if (ISD::isConstantSplatVectorAllOnes(N0.getNode())) - return N1; if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) return N0; @@ -5862,7 +5912,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue Shuffle = XformToShuffleWithZero(N)) return Shuffle; - if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) + if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N)) return Combined; // fold (and (or x, C), D) -> D if (C & D) == D @@ -6024,7 +6074,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || (N0.getOpcode() == ISD::ANY_EXTEND && N0.getOperand(0).getOpcode() == ISD::LOAD))) { - if (SDValue Res = ReduceLoadWidth(N)) { + if (SDValue Res = reduceLoadWidth(N)) { LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0); AddToWorklist(N); @@ -6659,7 +6709,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue Combined = visitORLike(N0, N1, N)) return Combined; - if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) + if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N)) return Combined; // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) @@ -8156,7 +8206,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); - if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) + if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N)) return Combined; return SDValue(); @@ -8948,6 +8998,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (SDValue MULH = combineShiftToMULH(N, DAG, TLI)) return MULH; + // Attempt to convert a sra of a load into a narrower sign-extending load. + if (SDValue NarrowLoad = reduceLoadWidth(N)) + return NarrowLoad; + return SDValue(); } @@ -9140,7 +9194,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return NewSRL; // Attempt to convert a srl of a load into a narrower zero-extending load. - if (SDValue NarrowLoad = ReduceLoadWidth(N)) + if (SDValue NarrowLoad = reduceLoadWidth(N)) return NarrowLoad; // Here is a common situation. 
We want to optimize: @@ -9358,6 +9412,17 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) { // fold (bswap (bswap x)) -> x if (N0.getOpcode() == ISD::BSWAP) return N0->getOperand(0); + + // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). If bitreverse + // isn't supported, it will be expanded to bswap followed by a manual reversal + // of bits in each byte. By placing bswaps before bitreverse, we can remove + // the two bswaps if the bitreverse gets expanded. + if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) { + SDLoc DL(N); + SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap); + } + return SDValue(); } @@ -10288,6 +10353,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG)) return S; + if (SDValue S = PerformUMinFpToSatCombine(LHS, RHS, N1, N2, CC, DAG)) + return S; // If this select has a condition (setcc) with narrower operands than the // select, try to widen the compare to match the select width. @@ -11357,7 +11424,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) - if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { + if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -11621,7 +11688,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::TRUNCATE) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) - if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { + if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -11864,7 +11931,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { - if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { + if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -12095,13 +12162,10 @@ SDValue DAGCombiner::visitAssertAlign(SDNode *N) { return SDValue(); } -/// If the result of a wider load is shifted to right of N bits and then -/// truncated to a narrower type and where N is a multiple of number of bits of -/// the narrower type, transform it to a narrower load from address + N / num of -/// bits of new type. Also narrow the load if the result is masked with an AND -/// to effectively produce a smaller type. If the result is to be extended, also -/// fold the extension to form a extending load. -SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { +/// If the result of a load is shifted/masked/truncated to an effectively +/// narrower type, try to transform the load to a narrower type and/or +/// use an extending load. 
+SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { unsigned Opc = N->getOpcode(); ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -12113,32 +12177,48 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { if (VT.isVector()) return SDValue(); + // The ShAmt variable is used to indicate that we've consumed a right + // shift. I.e. we want to narrow the width of the load by skipping to load the + // ShAmt least significant bits. unsigned ShAmt = 0; + // A special case is when the least significant bits from the load are masked + // away, but using an AND rather than a right shift. HasShiftedOffset is used + // to indicate that the narrowed load should be left-shifted ShAmt bits to get + // the result. bool HasShiftedOffset = false; // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then // extended to VT. if (Opc == ISD::SIGN_EXTEND_INREG) { ExtType = ISD::SEXTLOAD; ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT(); - } else if (Opc == ISD::SRL) { - // Another special-case: SRL is basically zero-extending a narrower value, - // or it maybe shifting a higher subword, half or byte into the lowest - // bits. - ExtType = ISD::ZEXTLOAD; - N0 = SDValue(N, 0); + } else if (Opc == ISD::SRL || Opc == ISD::SRA) { + // Another special-case: SRL/SRA is basically zero/sign-extending a narrower + // value, or it may be shifting a higher subword, half or byte into the + // lowest bits. - auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0)); - auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!N01 || !LN0) + // Only handle shift with constant shift amount, and the shiftee must be a + // load. + auto *LN = dyn_cast<LoadSDNode>(N0); + auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!N1C || !LN) + return SDValue(); + // If the shift amount is larger than the memory type then we're not + // accessing any of the loaded bytes. + ShAmt = N1C->getZExtValue(); + uint64_t MemoryWidth = LN->getMemoryVT().getScalarSizeInBits(); + if (MemoryWidth <= ShAmt) + return SDValue(); + // Attempt to fold away the SRL by using ZEXTLOAD and SRA by using SEXTLOAD. + ExtType = Opc == ISD::SRL ? ISD::ZEXTLOAD : ISD::SEXTLOAD; + ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt); + // If original load is a SEXTLOAD then we can't simply replace it by a + // ZEXTLOAD (we could potentially replace it by a more narrow SEXTLOAD + // followed by a ZEXT, but that is not handled at the moment). Similarly if + // the original load is a ZEXTLOAD and we want to use a SEXTLOAD. + if ((LN->getExtensionType() == ISD::SEXTLOAD || + LN->getExtensionType() == ISD::ZEXTLOAD) && + LN->getExtensionType() != ExtType) return SDValue(); - - uint64_t ShiftAmt = N01->getZExtValue(); - uint64_t MemoryWidth = LN0->getMemoryVT().getScalarSizeInBits(); - if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt) - ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt); - else - ExtVT = EVT::getIntegerVT(*DAG.getContext(), - VT.getScalarSizeInBits() - ShiftAmt); } else if (Opc == ISD::AND) { // An AND with a constant mask is the same as a truncate + zero-extend. 
auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1)); @@ -12161,55 +12241,80 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); } - if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { - SDValue SRL = N0; - if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) { - ShAmt = ConstShift->getZExtValue(); - unsigned EVTBits = ExtVT.getScalarSizeInBits(); - // Is the shift amount a multiple of size of VT? - if ((ShAmt & (EVTBits-1)) == 0) { - N0 = N0.getOperand(0); - // Is the load width a multiple of size of VT? - if ((N0.getScalarValueSizeInBits() & (EVTBits - 1)) != 0) - return SDValue(); - } + // In case Opc==SRL we've already prepared ExtVT/ExtType/ShAmt based on doing + // a right shift. Here we redo some of those checks, to possibly adjust the + // ExtVT even further based on "a masking AND". We could also end up here for + // other reasons (e.g. based on Opc==TRUNCATE) and that is why some checks + // need to be done here as well. + if (Opc == ISD::SRL || N0.getOpcode() == ISD::SRL) { + SDValue SRL = Opc == ISD::SRL ? SDValue(N, 0) : N0; + // Bail out when the SRL has more than one use. This is done for historical + // (undocumented) reasons. Maybe the intent was to guard the AND-masking + // check below? And maybe it could be non-profitable to do the transform in + // case the SRL has multiple uses and we get here with Opc!=ISD::SRL? + // FIXME: Can't we just skip this check for the Opc==ISD::SRL case? + if (!SRL.hasOneUse()) + return SDValue(); + + // Only handle shift with constant shift amount, and the shiftee must be a + // load. + auto *LN = dyn_cast<LoadSDNode>(SRL.getOperand(0)); + auto *SRL1C = dyn_cast<ConstantSDNode>(SRL.getOperand(1)); + if (!SRL1C || !LN) + return SDValue(); - // At this point, we must have a load or else we can't do the transform. - auto *LN0 = dyn_cast<LoadSDNode>(N0); - if (!LN0) return SDValue(); + // If the shift amount is larger than the input type then we're not + // accessing any of the loaded bytes. If the load was a zextload/extload + // then the result of the shift+trunc is zero/undef (handled elsewhere). + ShAmt = SRL1C->getZExtValue(); + uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits(); + if (ShAmt >= MemoryWidth) + return SDValue(); - // Because a SRL must be assumed to *need* to zero-extend the high bits - // (as opposed to anyext the high bits), we can't combine the zextload - // lowering of SRL and an sextload. - if (LN0->getExtensionType() == ISD::SEXTLOAD) - return SDValue(); + // Because a SRL must be assumed to *need* to zero-extend the high bits + // (as opposed to anyext the high bits), we can't combine the zextload + // lowering of SRL and an sextload. + if (LN->getExtensionType() == ISD::SEXTLOAD) + return SDValue(); - // If the shift amount is larger than the input type then we're not - // accessing any of the loaded bytes. If the load was a zextload/extload - // then the result of the shift+trunc is zero/undef (handled elsewhere). - if (ShAmt >= LN0->getMemoryVT().getSizeInBits()) + // Avoid reading outside the memory accessed by the original load (could + // happen if we only adjust the load base pointer by ShAmt). Instead we + // try to narrow the load even further. The typical scenario here is: + // (i64 (truncate (i96 (srl (load x), 64)))) -> + // (i64 (truncate (i96 (zextload (load i32 + offset) from i32)))) + if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) { + // Don't replace sextload by zextload.
+ if (ExtType == ISD::SEXTLOAD) return SDValue(); - - // If the SRL is only used by a masking AND, we may be able to adjust - // the ExtVT to make the AND redundant. - SDNode *Mask = *(SRL->use_begin()); - if (Mask->getOpcode() == ISD::AND && - isa<ConstantSDNode>(Mask->getOperand(1))) { - const APInt& ShiftMask = Mask->getConstantOperandAPInt(1); - if (ShiftMask.isMask()) { - EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), - ShiftMask.countTrailingOnes()); - // If the mask is smaller, recompute the type. - if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) && - TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT)) - ExtVT = MaskedVT; - } + // Narrow the load. + ExtType = ISD::ZEXTLOAD; + ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt); + } + + // If the SRL is only used by a masking AND, we may be able to adjust + // the ExtVT to make the AND redundant. + SDNode *Mask = *(SRL->use_begin()); + if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND && + isa<ConstantSDNode>(Mask->getOperand(1))) { + const APInt& ShiftMask = Mask->getConstantOperandAPInt(1); + if (ShiftMask.isMask()) { + EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), + ShiftMask.countTrailingOnes()); + // If the mask is smaller, recompute the type. + if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) && + TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) + ExtVT = MaskedVT; + } } + + N0 = SRL.getOperand(0); } - // If the load is shifted left (and the result isn't shifted back right), - // we can fold the truncate through the shift. + // If the load is shifted left (and the result isn't shifted back right), we + // can fold a truncate through the shift. The typical scenario is that N + // points at a TRUNCATE here so the attempted fold is: + // (truncate (shl (load x), c)) -> (shl (narrow load x), c) + // ShLeftAmt will indicate how much a narrowed load should be shifted left. unsigned ShLeftAmt = 0; if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { @@ -12237,12 +12342,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { return LVTStoreBits - EVTStoreBits - ShAmt; }; - // For big endian targets, we need to adjust the offset to the pointer to - // load the correct bytes. - if (DAG.getDataLayout().isBigEndian()) - ShAmt = AdjustBigEndianShift(ShAmt); + // We need to adjust the pointer to the load by ShAmt bits in order to load + // the correct bytes. + unsigned PtrAdjustmentInBits = + DAG.getDataLayout().isBigEndian() ? AdjustBigEndianShift(ShAmt) : ShAmt; - uint64_t PtrOff = ShAmt / 8; + uint64_t PtrOff = PtrAdjustmentInBits / 8; Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff); SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. @@ -12285,11 +12390,6 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { } if (HasShiftedOffset) { - // Recalculate the shift amount after it has been altered to calculate - // the offset. - if (DAG.getDataLayout().isBigEndian()) - ShAmt = AdjustBigEndianShift(ShAmt); - // We're using a shifted mask, so the load now has an offset. This means // that data has been loaded into the lower bytes than it would have been // before, so we need to shl the loaded data into the correct position in the @@ -12320,7 +12420,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); // If the input is already sign extended, just drop the extension.
- if (ExtVTBits >= DAG.ComputeMinSignedBits(N0)) + if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0)) return N0; // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 @@ -12336,7 +12436,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N00 = N0.getOperand(0); unsigned N00Bits = N00.getScalarValueSizeInBits(); - if ((N00Bits <= ExtVTBits || DAG.ComputeMinSignedBits(N00) <= ExtVTBits) && + if ((N00Bits <= ExtVTBits || + DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); } @@ -12355,7 +12456,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts); if ((N00Bits == ExtVTBits || (!IsZext && (N00Bits < ExtVTBits || - DAG.ComputeMinSignedBits(N00) <= ExtVTBits))) && + DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))) return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00); @@ -12381,7 +12482,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_in_reg (load x)) -> (smaller sextload x) // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) - if (SDValue NarrowLoad = ReduceLoadWidth(N)) + if (SDValue NarrowLoad = reduceLoadWidth(N)) return NarrowLoad; // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) @@ -12668,7 +12769,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { - if (SDValue Reduced = ReduceLoadWidth(N)) + if (SDValue Reduced = reduceLoadWidth(N)) return Reduced; // Handle the case where the load remains an extending load even @@ -17491,6 +17592,10 @@ void DAGCombiner::getStoreMergeCandidates( for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) TryToAddCandidate(I2); } + // Check stores that depend on the root (e.g. Store 3 in the chart above). + if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) { + TryToAddCandidate(I); + } } } else { for (auto I = RootNode->use_begin(), E = RootNode->use_end(); @@ -18351,6 +18456,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { Value.getValueType().isInteger() && (!isa<ConstantSDNode>(Value) || !cast<ConstantSDNode>(Value)->isOpaque())) { + // Convert a truncating store of an extension into a standard store.
+ if ((Value.getOpcode() == ISD::ZERO_EXTEND || + Value.getOpcode() == ISD::SIGN_EXTEND || + Value.getOpcode() == ISD::ANY_EXTEND) && + Value.getOperand(0).getValueType() == ST->getMemoryVT() && + TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT())) + return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + ST->getMemOperand()); + APInt TruncDemandedBits = APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits()); @@ -23299,6 +23413,8 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG)) return S; + if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG)) + return S; return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 4d1449bc2751..bfde35935c7b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1775,12 +1775,13 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) { return false; case Instruction::Call: - // On AIX, call lowering uses the DAG-ISEL path currently so that the + // On AIX, normal call lowering uses the DAG-ISEL path currently so that the // callee of the direct function call instruction will be mapped to the // symbol for the function's entry point, which is distinct from the // function descriptor symbol. The latter is the symbol whose XCOFF symbol // name is the C-linkage name of the source level function. - if (TM.getTargetTriple().isOSAIX()) + // But fast isel still has the ability to do selection for intrinsics. + if (TM.getTargetTriple().isOSAIX() && !isa<IntrinsicInst>(I)) return false; return selectCall(I); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5dfb65ef131a..54481b94fdd8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3593,9 +3593,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (Legalized) { // If we expanded the SETCC by swapping LHS and RHS, or by inverting the // condition code, create a new SETCC node. - if (Tmp3.getNode()) - Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), - Tmp1, Tmp2, Tmp3, Node->getFlags()); + if (Tmp3.getNode()) { + if (IsStrict) { + Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getVTList(), + {Chain, Tmp1, Tmp2, Tmp3}, Node->getFlags()); + Chain = Tmp1.getValue(1); + } else { + Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), Tmp1, + Tmp2, Tmp3, Node->getFlags()); + } + } // If we expanded the SETCC by inverting the condition code, then wrap // the existing SETCC in a NOT to restore the intended condition. 
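The rewritten reduceLoadWidth above replaces a shift/truncate of a wide load with a narrower load at a byte offset of ShAmt/8, mirroring that offset on big-endian targets via AdjustBigEndianShift. What follows is a minimal, standalone C++ sketch of that byte arithmetic only; it is not part of the patch, and the helper name hostIsLittleEndian is purely illustrative.

#include <cassert>
#include <cstdint>
#include <cstring>

// Detect host endianness by inspecting the first byte of a known word.
static bool hostIsLittleEndian() {
  const uint16_t Probe = 1;
  uint8_t FirstByte;
  std::memcpy(&FirstByte, &Probe, 1);
  return FirstByte == 1;
}

int main() {
  const uint64_t Wide = 0x1122334455667788ULL; // the original wide (i64) load
  const unsigned ShAmt = 32;                   // bits discarded by the srl
  const unsigned LEOff = ShAmt / 8;            // little-endian byte offset
  // Mirrors AdjustBigEndianShift: (wide bits - narrow bits - ShAmt) / 8.
  const unsigned BEOff = (sizeof(Wide) * 8 - 32 - ShAmt) / 8;

  const uint8_t *Bytes = reinterpret_cast<const uint8_t *>(&Wide);
  uint32_t Narrow;
  std::memcpy(&Narrow, Bytes + (hostIsLittleEndian() ? LEOff : BEOff),
              sizeof(Narrow));

  // The narrowed i32 load must equal (i32 (trunc (srl Wide, ShAmt))).
  assert(Narrow == static_cast<uint32_t>(Wide >> ShAmt));
  return 0;
}

On a little-endian host this reads the four bytes at offset 4, on a big-endian host the four bytes at offset 0, and both reproduce 0x11223344, which is why the combine only has to adjust the pointer (PtrOff) rather than emit any shift.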
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 27f9cede1922..6bf38d7296a8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1193,7 +1193,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { llvm_unreachable("Do not know how to expand the result of this operator!"); case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; - case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; + case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 518e525e13d0..8c7b90b6cd33 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -75,30 +75,28 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N)); break; - case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; - case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; + case ISD::SELECT: + case ISD::VSELECT: + case ISD::VP_SELECT: + Res = PromoteIntRes_Select(N); + break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break; case ISD::SMIN: - case ISD::SMAX: - Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ false); - break; + case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UMIN: case ISD::UMAX: Res = PromoteIntRes_UMINUMAX(N); break; case ISD::SHL: - Res = PromoteIntRes_SHL(N, /*IsVP*/ false); - break; + case ISD::VP_SHL: Res = PromoteIntRes_SHL(N); break; case ISD::SIGN_EXTEND_INREG: Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break; case ISD::SRA: - Res = PromoteIntRes_SRA(N, /*IsVP*/ false); - break; + case ISD::VP_ASHR: Res = PromoteIntRes_SRA(N); break; case ISD::SRL: - Res = PromoteIntRes_SRL(N, /*IsVP*/ false); - break; + case ISD::VP_LSHR: Res = PromoteIntRes_SRL(N); break; case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break; case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break; case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break; @@ -154,18 +152,22 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ADD: case ISD::SUB: case ISD::MUL: - Res = PromoteIntRes_SimpleIntBinOp(N, /*IsVP*/ false); - break; + case ISD::VP_AND: + case ISD::VP_OR: + case ISD::VP_XOR: + case ISD::VP_ADD: + case ISD::VP_SUB: + case ISD::VP_MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::SDIV: case ISD::SREM: - Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ false); - break; + case ISD::VP_SDIV: + case ISD::VP_SREM: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UDIV: case ISD::UREM: - Res = PromoteIntRes_ZExtIntBinOp(N, /*IsVP*/ false); - break; + case ISD::VP_UDIV: + case ISD::VP_UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break; case ISD::SADDO: case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break; @@ -260,32 +262,6 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FSHR: Res = PromoteIntRes_FunnelShift(N); break; - - case ISD::VP_AND: - case ISD::VP_OR: - case ISD::VP_XOR: - case ISD::VP_ADD: - case ISD::VP_SUB: - case ISD::VP_MUL: - Res = 
PromoteIntRes_SimpleIntBinOp(N, /*IsVP*/ true); - break; - case ISD::VP_SDIV: - case ISD::VP_SREM: - Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ true); - break; - case ISD::VP_UDIV: - case ISD::VP_UREM: - Res = PromoteIntRes_ZExtIntBinOp(N, /*IsVP*/ true); - break; - case ISD::VP_SHL: - Res = PromoteIntRes_SHL(N, /*IsVP*/ true); - break; - case ISD::VP_ASHR: - Res = PromoteIntRes_SRA(N, /*IsVP*/ true); - break; - case ISD::VP_LSHR: - Res = PromoteIntRes_SRL(N, /*IsVP*/ true); - break; } // If the result is null then the sub-method took care of registering it. @@ -1127,20 +1103,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { return Res; } -SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) { - SDValue LHS = GetPromotedInteger(N->getOperand(1)); - SDValue RHS = GetPromotedInteger(N->getOperand(2)); - return DAG.getSelect(SDLoc(N), - LHS.getValueType(), N->getOperand(0), LHS, RHS); -} - -SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntRes_Select(SDNode *N) { SDValue Mask = N->getOperand(0); SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); - return DAG.getNode(ISD::VSELECT, SDLoc(N), - LHS.getValueType(), Mask, LHS, RHS); + + unsigned Opcode = N->getOpcode(); + return Opcode == ISD::VP_SELECT + ? DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS, + N->getOperand(3)) + : DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, + RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) { @@ -1193,12 +1167,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { return DAG.getSExtOrTrunc(SetCC, dl, NVT); } -SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); - if (!IsVP) + if (N->getOpcode() != ISD::VP_SHL) return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getOperand(2), N->getOperand(3)); @@ -1210,34 +1184,40 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { Op.getValueType(), Op, N->getOperand(1)); } -SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { // The input may have strange things in the top bits of the registers, but // these operations don't care. They may have weird bits going out, but // that too is okay if they are integer operations. SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = GetPromotedInteger(N->getOperand(1)); - if (!IsVP) + if (N->getNumOperands() == 2) return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getOperand(2), N->getOperand(3)); } -SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { // Sign extend the input. 
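// A minimal standalone sketch (plain C++, not LLVM code; values are
// illustrative) of why PromoteIntRes_SimpleIntBinOp above can ignore how
// the inputs were extended while the sign-extending helpers here cannot:
// for wrap-around ops the low bits of the result depend only on the low
// bits of the inputs, but division observes every bit.
#include <cassert>
#include <cstdint>

int main() {
  int16_t A = -8, B = 2;
  uint32_t JunkA = 0xABCD0000u | static_cast<uint16_t>(A);
  uint32_t JunkB = 0x12340000u | static_cast<uint16_t>(B);
  // ADD/SUB/MUL/AND/OR/XOR: junk in the high half never reaches the low 16
  // bits of the result.
  assert(static_cast<int16_t>(JunkA + JunkB) == static_cast<int16_t>(A + B));
  // SDIV: every bit matters, so the input must really be sign extended.
  int32_t GoodA = A;                       // sign-filled: 0xFFFFFFF8
  int32_t BadA = static_cast<uint16_t>(A); // zero-filled: 0x0000FFF8
  assert(GoodA / B == A / B);              // -8 / 2 == -4
  assert(BadA / B != A / B);               // 65528 / 2 == 32764
  return 0;
}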
SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); - if (!IsVP) + if (N->getNumOperands() == 2) return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getOperand(2), N->getOperand(3)); } -SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { // Zero extend the input. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); - if (!IsVP) + if (N->getNumOperands() == 2) return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); + assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getOperand(2), N->getOperand(3)); } @@ -1251,25 +1231,25 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { LHS.getValueType(), LHS, RHS); } -SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { // The input value must be properly sign extended. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); - if (!IsVP) + if (N->getOpcode() != ISD::VP_ASHR) return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getOperand(2), N->getOperand(3)); } -SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { // The input value must be properly zero extended. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); - if (!IsVP) + if (N->getOpcode() != ISD::VP_LSHR) return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS, N->getOperand(2), N->getOperand(3)); @@ -1653,7 +1633,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::UDIVFIX: case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break; - case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break; + case ISD::FPOWI: + case ISD::STRICT_FPOWI: Res = PromoteIntOp_FPOWI(N); break; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: @@ -1703,50 +1684,64 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { /// PromoteSetCCOperands - Promote the operands of a comparison. This code is /// shared among BR_CC, SELECT_CC, and SETCC handlers. -void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, +void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS, ISD::CondCode CCCode) { // We have to insert explicit sign or zero extends. Note that we could // insert sign extends for ALL conditions. For those operations where either - // zero or sign extension would be valid, use SExtOrZExtPromotedInteger - // which will choose the cheapest for the target. 
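// A standalone illustration (plain C++; inputs chosen for the example) of
// why sign extension is acceptable even for the unsigned comparisons
// handled below: applied to both operands, sext is monotone with respect
// to unsigned ordering, so the widened compare returns the same verdict.
#include <cassert>
#include <cstdint>

int main() {
  uint8_t A = 0x80, B = 0x01;
  auto SExt = [](uint8_t V) { // reinterpret as signed, widen, view unsigned
    return static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(V)));
  };
  assert((A > B) == (SExt(A) > SExt(B))); // 0xFFFFFF80 > 0x00000001
  assert((A < B) == (SExt(A) < SExt(B)));
  assert((A == B) == (SExt(A) == SExt(B)));
  return 0;
}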
- switch (CCCode) { - default: llvm_unreachable("Unknown integer comparison!"); - case ISD::SETEQ: - case ISD::SETNE: { - SDValue OpL = GetPromotedInteger(NewLHS); - SDValue OpR = GetPromotedInteger(NewRHS); - - // We would prefer to promote the comparison operand with sign extension. - // If the width of OpL/OpR excluding the duplicated sign bits is no greater - // than the width of NewLHS/NewRH, we can avoid inserting real truncate - // instruction, which is redundant eventually. - unsigned OpLEffectiveBits = DAG.ComputeMinSignedBits(OpL); - unsigned OpREffectiveBits = DAG.ComputeMinSignedBits(OpR); - if (OpLEffectiveBits <= NewLHS.getScalarValueSizeInBits() && - OpREffectiveBits <= NewRHS.getScalarValueSizeInBits()) { - NewLHS = OpL; - NewRHS = OpR; - } else { - NewLHS = SExtOrZExtPromotedInteger(NewLHS); - NewRHS = SExtOrZExtPromotedInteger(NewRHS); + // zero or sign extension would be valid, we ask the target which extension + // it would prefer. + + // Signed comparisons always require sign extension. + if (ISD::isSignedIntSetCC(CCCode)) { + LHS = SExtPromotedInteger(LHS); + RHS = SExtPromotedInteger(RHS); + return; + } + + assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) && + "Unknown integer comparison!"); + + SDValue OpL = GetPromotedInteger(LHS); + SDValue OpR = GetPromotedInteger(RHS); + + if (TLI.isSExtCheaperThanZExt(LHS.getValueType(), OpL.getValueType())) { + // The target would prefer to promote the comparison operand with sign + // extension. Honor that unless the promoted values are already zero + // extended. + unsigned OpLEffectiveBits = + DAG.computeKnownBits(OpL).countMaxActiveBits(); + unsigned OpREffectiveBits = + DAG.computeKnownBits(OpR).countMaxActiveBits(); + if (OpLEffectiveBits <= LHS.getScalarValueSizeInBits() && + OpREffectiveBits <= RHS.getScalarValueSizeInBits()) { + LHS = OpL; + RHS = OpR; + return; } - break; + + // The promoted values aren't zero extended, use a sext_inreg. + LHS = SExtPromotedInteger(LHS); + RHS = SExtPromotedInteger(RHS); + return; } - case ISD::SETUGE: - case ISD::SETUGT: - case ISD::SETULE: - case ISD::SETULT: - NewLHS = SExtOrZExtPromotedInteger(NewLHS); - NewRHS = SExtOrZExtPromotedInteger(NewRHS); - break; - case ISD::SETGE: - case ISD::SETGT: - case ISD::SETLT: - case ISD::SETLE: - NewLHS = SExtPromotedInteger(NewLHS); - NewRHS = SExtPromotedInteger(NewRHS); - break; + + // Prefer to promote the comparison operand with zero extension. + + // If the width of OpL/OpR excluding the duplicated sign bits is no greater + // than the width of LHS/RHS, we can avoid/ inserting a zext_inreg operation + // that we might not be able to remove. + unsigned OpLEffectiveBits = DAG.ComputeMaxSignificantBits(OpL); + unsigned OpREffectiveBits = DAG.ComputeMaxSignificantBits(OpR); + if (OpLEffectiveBits <= LHS.getScalarValueSizeInBits() && + OpREffectiveBits <= RHS.getScalarValueSizeInBits()) { + LHS = OpL; + RHS = OpR; + return; } + + // Otherwise, use zext_inreg. + LHS = ZExtPromotedInteger(LHS); + RHS = ZExtPromotedInteger(RHS); } SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) { @@ -2099,8 +2094,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) { } SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) { - // FIXME: Support for promotion of STRICT_FPOWI is not implemented yet. - assert(N->getOpcode() == ISD::FPOWI && "No STRICT_FPOWI support here yet."); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Chain = IsStrict ? 
N->getOperand(0) : SDValue(); // The integer operand is the last operand in FPOWI (so the result and // floating point operand is already type legalized). @@ -2118,17 +2113,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) { DAG.getContext()->emitError("Don't know how to promote fpowi to fpow"); return DAG.getUNDEF(N->getValueType(0)); } + unsigned OpOffset = IsStrict ? 1 : 0; // The exponent should fit in a sizeof(int) type for the libcall to be valid. assert(DAG.getLibInfo().getIntSize() == - N->getOperand(1).getValueType().getSizeInBits() && + N->getOperand(1 + OpOffset).getValueType().getSizeInBits() && "POWI exponent should match with sizeof(int) when doing the libcall."); TargetLowering::MakeLibCallOptions CallOptions; CallOptions.setSExt(true); - SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - std::pair<SDValue, SDValue> Tmp = - TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, - CallOptions, SDLoc(N), SDValue()); + SDValue Ops[2] = {N->getOperand(0 + OpOffset), N->getOperand(1 + OpOffset)}; + std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall( + DAG, LC, N->getValueType(0), Ops, CallOptions, SDLoc(N), Chain); ReplaceValueWith(SDValue(N, 0), Tmp.first); + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Tmp.second); return SDValue(); } @@ -2255,7 +2252,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; - case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; + case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index da282ecad282..4d8daa82d8c0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -334,18 +334,17 @@ private: SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); - SDValue PromoteIntRes_SELECT(SDNode *N); - SDValue PromoteIntRes_VSELECT(SDNode *N); + SDValue PromoteIntRes_Select(SDNode *N); SDValue PromoteIntRes_SELECT_CC(SDNode *N); SDValue PromoteIntRes_SETCC(SDNode *N); - SDValue PromoteIntRes_SHL(SDNode *N, bool IsVP); - SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N, bool IsVP); - SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N, bool IsVP); - SDValue PromoteIntRes_SExtIntBinOp(SDNode *N, bool IsVP); + SDValue PromoteIntRes_SHL(SDNode *N); + SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N); + SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N); + SDValue PromoteIntRes_SExtIntBinOp(SDNode *N); SDValue PromoteIntRes_UMINUMAX(SDNode *N); SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N); - SDValue PromoteIntRes_SRA(SDNode *N, bool IsVP); - SDValue PromoteIntRes_SRL(SDNode *N, bool IsVP); + SDValue PromoteIntRes_SRA(SDNode *N); + SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo); @@ -819,6 +818,12 @@ private: void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi); void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi); + /// Split mask operator of a VP intrinsic. 
+ std::pair<SDValue, SDValue> SplitMask(SDValue Mask); + + /// Split mask operator of a VP intrinsic in a given location. + std::pair<SDValue, SDValue> SplitMask(SDValue Mask, const SDLoc &DL); + // Helper function for incrementing the pointer when splitting // memory operations void IncrementPointer(MemSDNode *N, EVT MemVT, MachinePointerInfo &MPI, @@ -826,7 +831,7 @@ private: // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>. void SplitVectorResult(SDNode *N, unsigned ResNo); - void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi, bool IsVP); + void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -847,8 +852,10 @@ private: void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); - void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi); + void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, + bool SplitSETCC = false); void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -864,6 +871,7 @@ private: SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VECREDUCE_SEQ(SDNode *N); + SDValue SplitVecOp_VP_REDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); @@ -873,9 +881,10 @@ private: SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); + SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); - SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); - SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo); + SDValue SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo); + SDValue SplitVecOp_Gather(MemSDNode *MGT, unsigned OpNo); SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode *N); @@ -900,6 +909,23 @@ private: } void SetWidenedVector(SDValue Op, SDValue Result); + /// Given a mask Mask, returns the larger vector into which Mask was widened. + SDValue GetWidenedMask(SDValue Mask, ElementCount EC) { + // For VP operations, we must also widen the mask. Note that the mask type + // may not actually need widening, leading it be split along with the VP + // operation. + // FIXME: This could lead to an infinite split/widen loop. We only handle + // the case where the mask needs widening to an identically-sized type as + // the vector inputs. + assert(getTypeAction(Mask.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen binary VP op"); + Mask = GetWidenedVector(Mask); + assert(Mask.getValueType().getVectorElementCount() == EC && + "Unable to widen binary VP op"); + return Mask; + } + // Widen Vector Result Promotion. 
void WidenVectorResult(SDNode *N, unsigned ResNo); SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo); @@ -911,10 +937,12 @@ private: SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); + SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); + SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N); SDValue WidenVecRes_ScalarOp(SDNode* N); - SDValue WidenVecRes_SELECT(SDNode* N); + SDValue WidenVecRes_Select(SDNode *N); SDValue WidenVSELECTMask(SDNode *N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); @@ -923,7 +951,7 @@ private: SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N); SDValue WidenVecRes_Ternary(SDNode *N); - SDValue WidenVecRes_Binary(SDNode *N, bool IsVP); + SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_BinaryCanTrap(SDNode *N); SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N); SDValue WidenVecRes_StrictFP(SDNode *N); @@ -945,9 +973,11 @@ private: SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); + SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo); SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); + SDValue WidenVecOp_VP_SCATTER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); SDValue WidenVecOp_STRICT_FSETCC(SDNode* N); SDValue WidenVecOp_VSELECT(SDNode *N); @@ -957,6 +987,7 @@ private: SDValue WidenVecOp_FCOPYSIGN(SDNode *N); SDValue WidenVecOp_VECREDUCE(SDNode *N); SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N); + SDValue WidenVecOp_VP_REDUCE(SDNode *N); /// Helper function to generate a set of operations to perform /// a vector operation for a wider type. 
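// A lane-wise scalar model (plain C++; function names are illustrative,
// semantics as I read the VP intrinsic documentation) of the two select
// forms the renamed Select handlers above now share: vp.select is
// unspecified at and above EVL, while vp.merge stays fully defined and
// takes the false operand there.
#include <cassert>
#include <optional>

std::optional<int> VPSelectLane(bool M, int T, int F, unsigned I,
                                unsigned EVL) {
  if (I >= EVL)
    return std::nullopt; // lane result is unspecified
  return M ? T : F;
}

int VPMergeLane(bool M, int T, int F, unsigned I, unsigned Pivot) {
  return (I < Pivot && M) ? T : F; // defined for every lane
}

int main() {
  assert(*VPSelectLane(true, 1, 2, /*I=*/0, /*EVL=*/4) == 1);
  assert(!VPSelectLane(true, 1, 2, /*I=*/5, /*EVL=*/4).has_value());
  assert(VPMergeLane(true, 1, 2, /*I=*/5, /*Pivot=*/4) == 2);
  return 0;
}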
@@ -1023,7 +1054,7 @@ private: void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi); void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 3d3c9a2ad837..c6885677d644 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -506,9 +506,10 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, GetSplitOp(Op, Lo, Hi); } -void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { +void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH, CL, CH; SDLoc dl(N); + unsigned Opcode = N->getOpcode(); GetSplitOp(N->getOperand(1), LL, LH); GetSplitOp(N->getOperand(2), RL, RH); @@ -539,8 +540,18 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { std::tie(CL, CH) = DAG.SplitVector(Cond, dl); } - Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL); - Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH); + if (Opcode != ISD::VP_SELECT && Opcode != ISD::VP_MERGE) { + Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL); + Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH); + return; + } + + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl); + + Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL, EVLLo); + Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi); } void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 1493f36fcd3e..abf6a3ac6916 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -133,6 +133,8 @@ class VectorLegalizer { /// Implement vselect in terms of XOR, AND, OR when blend is not /// supported by the target. SDValue ExpandVSELECT(SDNode *Node); + SDValue ExpandVP_SELECT(SDNode *Node); + SDValue ExpandVP_MERGE(SDNode *Node); SDValue ExpandSELECT(SDNode *Node); std::pair<SDValue, SDValue> ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -457,6 +459,14 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; } + +#define BEGIN_REGISTER_VP_SDNODE(VPID, LEGALPOS, ...) \ + case ISD::VPID: { \ + EVT LegalizeVT = LEGALPOS < 0 ? 
Node->getValueType(-(1 + LEGALPOS)) \ + : Node->getOperand(LEGALPOS).getValueType(); \ + Action = TLI.getOperationAction(Node->getOpcode(), LegalizeVT); \ + } break; +#include "llvm/IR/VPIntrinsics.def" } LLVM_DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG)); @@ -718,6 +728,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::VSELECT: Results.push_back(ExpandVSELECT(Node)); return; + case ISD::VP_SELECT: + Results.push_back(ExpandVP_SELECT(Node)); + return; case ISD::SELECT: Results.push_back(ExpandSELECT(Node)); return; @@ -865,6 +878,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::UREM: ExpandREM(Node, Results); return; + case ISD::VP_MERGE: + Results.push_back(ExpandVP_MERGE(Node)); + return; } Results.push_back(DAG.UnrollVectorOp(Node)); @@ -1195,6 +1211,79 @@ SDValue VectorLegalizer::ExpandVSELECT(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, Node->getValueType(0), Val); } +SDValue VectorLegalizer::ExpandVP_SELECT(SDNode *Node) { + // Implement VP_SELECT in terms of VP_XOR, VP_AND and VP_OR on platforms which + // do not support it natively. + SDLoc DL(Node); + + SDValue Mask = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + SDValue Op2 = Node->getOperand(2); + SDValue EVL = Node->getOperand(3); + + EVT VT = Mask.getValueType(); + + // If we can't even use the basic vector operations of + // VP_AND,VP_OR,VP_XOR, we will have to scalarize the op. + if (TLI.getOperationAction(ISD::VP_AND, VT) == TargetLowering::Expand || + TLI.getOperationAction(ISD::VP_XOR, VT) == TargetLowering::Expand || + TLI.getOperationAction(ISD::VP_OR, VT) == TargetLowering::Expand) + return DAG.UnrollVectorOp(Node); + + // This operation also isn't safe when the operands aren't also booleans. + if (Op1.getValueType().getVectorElementType() != MVT::i1) + return DAG.UnrollVectorOp(Node); + + SDValue Ones = DAG.getAllOnesConstant(DL, VT); + SDValue NotMask = DAG.getNode(ISD::VP_XOR, DL, VT, Mask, Ones, Mask, EVL); + + Op1 = DAG.getNode(ISD::VP_AND, DL, VT, Op1, Mask, Mask, EVL); + Op2 = DAG.getNode(ISD::VP_AND, DL, VT, Op2, NotMask, Mask, EVL); + return DAG.getNode(ISD::VP_OR, DL, VT, Op1, Op2, Mask, EVL); +} + +SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) { + // Implement VP_MERGE in terms of VSELECT. Construct a mask where vector + // indices less than the EVL/pivot are true. Combine that with the original + // mask for a full-length mask. Use a full-length VSELECT to select between + // the true and false values. + SDLoc DL(Node); + + SDValue Mask = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + SDValue Op2 = Node->getOperand(2); + SDValue EVL = Node->getOperand(3); + + EVT MaskVT = Mask.getValueType(); + bool IsFixedLen = MaskVT.isFixedLengthVector(); + + EVT EVLVecVT = EVT::getVectorVT(*DAG.getContext(), EVL.getValueType(), + MaskVT.getVectorElementCount()); + + // If we can't construct the EVL mask efficiently, it's better to unroll. + if ((IsFixedLen && + !TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, EVLVecVT)) || + (!IsFixedLen && + (!TLI.isOperationLegalOrCustom(ISD::STEP_VECTOR, EVLVecVT) || + !TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR, EVLVecVT)))) + return DAG.UnrollVectorOp(Node); + + // If using a SETCC would result in a different type than the mask type, + // unroll. 
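// A standalone model (plain C++, scalar loop in place of DAG nodes) of the
// VP_MERGE expansion that follows: build a mask that is true exactly for
// lane indices below EVL (a step vector compared SETULT against splatted
// EVL), AND it with the original mask, then emit one full-length select.
#include <array>
#include <cassert>

int main() {
  constexpr unsigned N = 8;
  std::array<bool, N> Mask = {true, false, true, false,
                              true, false, true, false};
  std::array<int, N> OnTrue{}, OnFalse{}, Res{};
  OnTrue.fill(7);
  OnFalse.fill(9);
  unsigned EVL = 3;
  for (unsigned I = 0; I < N; ++I) {
    bool EVLMask = I < EVL;         // step_vector SETULT splat(EVL)
    bool Full = Mask[I] && EVLMask; // AND with the original mask
    Res[I] = Full ? OnTrue[I] : OnFalse[I];
  }
  assert(Res[0] == 7 && Res[1] == 9 && Res[2] == 7);
  assert(Res[4] == 9); // beyond EVL every lane takes the false operand
  return 0;
}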
+ if (TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + EVLVecVT) != MaskVT) + return DAG.UnrollVectorOp(Node); + + SDValue StepVec = DAG.getStepVector(DL, EVLVecVT); + SDValue SplatEVL = IsFixedLen ? DAG.getSplatBuildVector(EVLVecVT, DL, EVL) + : DAG.getSplatVector(EVLVecVT, DL, EVL); + SDValue EVLMask = + DAG.getSetCC(DL, MaskVT, StepVec, SplatEVL, ISD::CondCode::SETULT); + + SDValue FullMask = DAG.getNode(ISD::AND, DL, MaskVT, Mask, EVLMask); + return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2); +} + void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl<SDValue> &Results) { // Attempt to expand using TargetLowering. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7ec2638b1e71..0bd44ce4c872 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -914,7 +914,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::VSELECT: - case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; + case ISD::SELECT: + case ISD::VP_MERGE: + case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break; @@ -936,11 +938,15 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::LOAD: SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break; + case ISD::VP_LOAD: + SplitVecRes_VP_LOAD(cast<VPLoadSDNode>(N), Lo, Hi); + break; case ISD::MLOAD: SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi); break; case ISD::MGATHER: - SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi); + case ISD::VP_GATHER: + SplitVecRes_Gather(cast<MemSDNode>(N), Lo, Hi, /*SplitSETCC*/ true); break; case ISD::SETCC: SplitVecRes_SETCC(N, Lo, Hi); @@ -1008,31 +1014,31 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_ExtendOp(N, Lo, Hi); break; - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: + case ISD::ADD: case ISD::VP_ADD: + case ISD::SUB: case ISD::VP_SUB: + case ISD::MUL: case ISD::VP_MUL: case ISD::MULHS: case ISD::MULHU: - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: + case ISD::FADD: case ISD::VP_FADD: + case ISD::FSUB: case ISD::VP_FSUB: + case ISD::FMUL: case ISD::VP_FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINIMUM: case ISD::FMAXIMUM: - case ISD::SDIV: - case ISD::UDIV: - case ISD::FDIV: + case ISD::SDIV: case ISD::VP_SDIV: + case ISD::UDIV: case ISD::VP_UDIV: + case ISD::FDIV: case ISD::VP_FDIV: case ISD::FPOW: - case ISD::AND: - case ISD::OR: - case ISD::XOR: - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: - case ISD::UREM: - case ISD::SREM: - case ISD::FREM: + case ISD::AND: case ISD::VP_AND: + case ISD::OR: case ISD::VP_OR: + case ISD::XOR: case ISD::VP_XOR: + case ISD::SHL: case ISD::VP_SHL: + case ISD::SRA: case ISD::VP_ASHR: + case ISD::SRL: case ISD::VP_LSHR: + case ISD::UREM: case ISD::VP_UREM: + case ISD::SREM: case ISD::VP_SREM: + case ISD::FREM: case ISD::VP_FREM: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -1045,7 +1051,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: - SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ false); + SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: case ISD::FSHL: @@ 
-1082,26 +1088,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::UDIVFIXSAT: SplitVecRes_FIX(N, Lo, Hi); break; - case ISD::VP_ADD: - case ISD::VP_AND: - case ISD::VP_MUL: - case ISD::VP_OR: - case ISD::VP_SUB: - case ISD::VP_XOR: - case ISD::VP_SHL: - case ISD::VP_LSHR: - case ISD::VP_ASHR: - case ISD::VP_SDIV: - case ISD::VP_UDIV: - case ISD::VP_SREM: - case ISD::VP_UREM: - case ISD::VP_FADD: - case ISD::VP_FSUB: - case ISD::VP_FMUL: - case ISD::VP_FDIV: - case ISD::VP_FREM: - SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ true); - break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -1133,8 +1119,22 @@ void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT, } } -void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi, - bool IsVP) { +std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask) { + return SplitMask(Mask, SDLoc(Mask)); +} + +std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask, + const SDLoc &DL) { + SDValue MaskLo, MaskHi; + EVT MaskVT = Mask.getValueType(); + if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + return std::make_pair(MaskLo, MaskHi); +} + +void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDValue RHSLo, RHSHi; @@ -1143,36 +1143,21 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi, const SDNodeFlags Flags = N->getFlags(); unsigned Opcode = N->getOpcode(); - if (!IsVP) { + if (N->getNumOperands() == 2) { Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags); Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags); return; } - // Split the mask. + assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + SDValue MaskLo, MaskHi; - SDValue Mask = N->getOperand(2); - EVT MaskVT = Mask.getValueType(); - if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector) - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, SDLoc(Mask)); - - // Split the vector length parameter. - // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl - %halfnumelts). - SDValue EVL = N->getOperand(3); - EVT VecVT = N->getValueType(0); - EVT EVLVT = EVL.getValueType(); - assert(VecVT.getVectorElementCount().isKnownEven() && - "Expecting the mask to be an evenly-sized vector"); - unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2; - SDValue HalfNumElts = - VecVT.isFixedLengthVector() - ? 
DAG.getConstant(HalfMinNumElts, dl, EVLVT) - : DAG.getVScale(dl, EVLVT, - APInt(EVLVT.getScalarSizeInBits(), HalfMinNumElts)); - SDValue EVLLo = DAG.getNode(ISD::UMIN, dl, EVLVT, EVL, HalfNumElts); - SDValue EVLHi = DAG.getNode(ISD::USUBSAT, dl, EVLVT, EVL, HalfNumElts); + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(2)); + + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl); Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), {LHSLo, RHSLo, MaskLo, EVLLo}, Flags); @@ -1781,6 +1766,86 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, ReplaceValueWith(SDValue(LD, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, + SDValue &Hi) { + assert(LD->isUnindexed() && "Indexed VP load during type legalization!"); + EVT LoVT, HiVT; + SDLoc dl(LD); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0)); + + ISD::LoadExtType ExtType = LD->getExtensionType(); + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + SDValue Offset = LD->getOffset(); + assert(Offset.isUndef() && "Unexpected indexed variable-length load offset"); + Align Alignment = LD->getOriginalAlign(); + SDValue Mask = LD->getMask(); + SDValue EVL = LD->getVectorLength(); + EVT MemoryVT = LD->getMemoryVT(); + + EVT LoMemVT, HiMemVT; + bool HiIsEmpty = false; + std::tie(LoMemVT, HiMemVT) = + DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty); + + // Split Mask operand + SDValue MaskLo, MaskHi; + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + } + + // Split EVL operand + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, LD->getValueType(0), dl); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + LD->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, LD->getAAInfo(), LD->getRanges()); + + Lo = + DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset, + MaskLo, EVLLo, LoMemVT, MMO, LD->isExpandingLoad()); + + if (HiIsEmpty) { + // The hi vp_load has zero storage size. We therefore simply set it to + // the low vp_load and rely on subsequent removal from the chain. + Hi = Lo; + } else { + // Generate hi vp_load. + Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, + LD->isExpandingLoad()); + + MachinePointerInfo MPI; + if (LoMemVT.isScalableVector()) + MPI = MachinePointerInfo(LD->getPointerInfo().getAddrSpace()); + else + MPI = LD->getPointerInfo().getWithOffset( + LoMemVT.getStoreSize().getFixedSize()); + + MMO = DAG.getMachineFunction().getMachineMemOperand( + MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment, + LD->getAAInfo(), LD->getRanges()); + + Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr, + Offset, MaskHi, EVLHi, HiMemVT, MMO, + LD->isExpandingLoad()); + } + + // Build a factor node to remember that this load is independent of the + // other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
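// A standalone model (plain C++; SplitEVLModel is an illustrative name) of
// the EVL split that DAG.SplitEVL now centralizes, matching the inline
// umin/usubsat sequence removed above: the low half runs min(EVL, Half)
// lanes and the high half runs the saturated remainder.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> SplitEVLModel(uint32_t EVL, uint32_t Half) {
  uint32_t Lo = std::min(EVL, Half);         // ISD::UMIN
  uint32_t Hi = EVL > Half ? EVL - Half : 0; // ISD::USUBSAT
  return {Lo, Hi};
}

int main() {
  auto R = SplitEVLModel(/*EVL=*/5, /*Half=*/4);
  assert(R.first == 4 && R.second == 1); // straddles the split point
  R = SplitEVLModel(/*EVL=*/3, /*Half=*/4);
  assert(R.first == 3 && R.second == 0); // high half entirely inactive
  return 0;
}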
+ ReplaceValueWith(SDValue(LD, 1), Ch); +} + void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { assert(MLD->isUnindexed() && "Indexed masked load during type legalization!"); @@ -1865,61 +1930,85 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, } -void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, - SDValue &Lo, SDValue &Hi) { +void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, + SDValue &Hi, bool SplitSETCC) { EVT LoVT, HiVT; - SDLoc dl(MGT); - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0)); - - SDValue Ch = MGT->getChain(); - SDValue Ptr = MGT->getBasePtr(); - SDValue Mask = MGT->getMask(); - SDValue PassThru = MGT->getPassThru(); - SDValue Index = MGT->getIndex(); - SDValue Scale = MGT->getScale(); - EVT MemoryVT = MGT->getMemoryVT(); - Align Alignment = MGT->getOriginalAlign(); - ISD::LoadExtType ExtType = MGT->getExtensionType(); + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + SDValue Ch = N->getChain(); + SDValue Ptr = N->getBasePtr(); + struct Operands { + SDValue Mask; + SDValue Index; + SDValue Scale; + } Ops = [&]() -> Operands { + if (auto *MSC = dyn_cast<MaskedGatherSDNode>(N)) { + return {MSC->getMask(), MSC->getIndex(), MSC->getScale()}; + } + auto *VPSC = cast<VPGatherSDNode>(N); + return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale()}; + }(); + + EVT MemoryVT = N->getMemoryVT(); + Align Alignment = N->getOriginalAlign(); // Split Mask operand SDValue MaskLo, MaskHi; - if (Mask.getOpcode() == ISD::SETCC) { - SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + if (SplitSETCC && Ops.Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi); } else { - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, dl); } EVT LoMemVT, HiMemVT; // Split MemoryVT std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - SDValue PassThruLo, PassThruHi; - if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(PassThru, PassThruLo, PassThruHi); - else - std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl); - SDValue IndexHi, IndexLo; - if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Index, IndexLo, IndexHi); + if (getTypeAction(Ops.Index.getValueType()) == + TargetLowering::TypeSplitVector) + GetSplitVector(Ops.Index, IndexLo, IndexHi); else - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MGT->getPointerInfo(), MachineMemOperand::MOLoad, - MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(), - MGT->getRanges()); + N->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); + + if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) { + SDValue PassThru = MGT->getPassThru(); + SDValue PassThruLo, PassThruHi; + if (getTypeAction(PassThru.getValueType()) == + TargetLowering::TypeSplitVector) + GetSplitVector(PassThru, PassThruLo, PassThruHi); + else + std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl); - SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; - Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, 
MVT::Other), LoMemVT, dl, OpsLo, - MMO, MGT->getIndexType(), ExtType); + ISD::LoadExtType ExtType = MGT->getExtensionType(); + ISD::MemIndexType IndexTy = MGT->getIndexType(); - SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; - Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi, - MMO, MGT->getIndexType(), ExtType); + SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Ops.Scale}; + Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, + OpsLo, MMO, IndexTy, ExtType); + + SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Ops.Scale}; + Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, + OpsHi, MMO, IndexTy, ExtType); + } else { + auto *VPGT = cast<VPGatherSDNode>(N); + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(VPGT->getVectorLength(), MemoryVT, dl); + + SDValue OpsLo[] = {Ch, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo}; + Lo = DAG.getGatherVP(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo, + MMO, VPGT->getIndexType()); + + SDValue OpsHi[] = {Ch, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi}; + Hi = DAG.getGatherVP(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi, + MMO, VPGT->getIndexType()); + } // Build a factor node to remember that this load is independent of the // other one. @@ -1928,10 +2017,9 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, // Legalize the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(MGT, 1), Ch); + ReplaceValueWith(SDValue(N, 1), Ch); } - void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && @@ -2221,14 +2309,19 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; + case ISD::VP_STORE: + Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo); + break; case ISD::MSTORE: Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo); break; case ISD::MSCATTER: - Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo); + case ISD::VP_SCATTER: + Res = SplitVecOp_Scatter(cast<MemSDNode>(N), OpNo); break; case ISD::MGATHER: - Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo); + case ISD::VP_GATHER: + Res = SplitVecOp_Gather(cast<MemSDNode>(N), OpNo); break; case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); @@ -2285,6 +2378,23 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_SEQ_FMUL: Res = SplitVecOp_VECREDUCE_SEQ(N); break; + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_SEQ_FADD: + case ISD::VP_REDUCE_FMUL: + case ISD::VP_REDUCE_SEQ_FMUL: + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + case ISD::VP_REDUCE_FMAX: + case ISD::VP_REDUCE_FMIN: + Res = SplitVecOp_VP_REDUCE(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
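// A standalone model (plain C++; VPReduceAdd is an illustrative name, and
// an all-true mask is assumed for brevity) of how the VP reductions listed
// above are split below: reduce the low half against the original start
// value, then feed that partial result in as the start value of the high
// half, with the EVL divided as in SplitEVL.
#include <cassert>
#include <vector>

int VPReduceAdd(int Start, const std::vector<int> &V, unsigned EVL) {
  for (unsigned I = 0; I < EVL && I < V.size(); ++I)
    Start += V[I];
  return Start;
}

int main() {
  std::vector<int> Lo = {1, 2, 3, 4}, Hi = {5, 6, 7, 8};
  unsigned EVL = 6, Half = 4;
  int Partial = VPReduceAdd(/*Start=*/100, Lo, EVL < Half ? EVL : Half);
  int Result = VPReduceAdd(Partial, Hi, EVL > Half ? EVL - Half : 0);
  assert(Result == 100 + (1 + 2 + 3 + 4) + (5 + 6)); // six active lanes
  return 0;
}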
@@ -2381,6 +2491,33 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE_SEQ(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, Hi, Flags); } +SDValue DAGTypeLegalizer::SplitVecOp_VP_REDUCE(SDNode *N, unsigned OpNo) { + assert(N->isVPOpcode() && "Expected VP opcode"); + assert(OpNo == 1 && "Can only split reduce vector operand"); + + unsigned Opc = N->getOpcode(); + EVT ResVT = N->getValueType(0); + SDValue Lo, Hi; + SDLoc dl(N); + + SDValue VecOp = N->getOperand(OpNo); + EVT VecVT = VecOp.getValueType(); + assert(VecVT.isVector() && "Can only split reduce vector operand"); + GetSplitVector(VecOp, Lo, Hi); + + SDValue MaskLo, MaskHi; + std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(2)); + + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = DAG.SplitEVL(N->getOperand(3), VecVT, dl); + + const SDNodeFlags Flags = N->getFlags(); + + SDValue ResLo = + DAG.getNode(Opc, dl, ResVT, {N->getOperand(0), Lo, MaskLo, EVLLo}, Flags); + return DAG.getNode(Opc, dl, ResVT, {ResLo, Hi, MaskHi, EVLHi}, Flags); +} + SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); @@ -2558,70 +2695,92 @@ SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); } -SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, - unsigned OpNo) { - EVT LoVT, HiVT; - SDLoc dl(MGT); - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0)); - - SDValue Ch = MGT->getChain(); - SDValue Ptr = MGT->getBasePtr(); - SDValue Index = MGT->getIndex(); - SDValue Scale = MGT->getScale(); - SDValue Mask = MGT->getMask(); - SDValue PassThru = MGT->getPassThru(); - Align Alignment = MGT->getOriginalAlign(); - ISD::LoadExtType ExtType = MGT->getExtensionType(); +SDValue DAGTypeLegalizer::SplitVecOp_Gather(MemSDNode *N, unsigned OpNo) { + (void)OpNo; + SDValue Lo, Hi; + SplitVecRes_Gather(N, Lo, Hi); - SDValue MaskLo, MaskHi; - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - // Split Mask operand - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, N, N->getValueType(0), Lo, Hi); + ReplaceValueWith(SDValue(N, 0), Res); + return SDValue(); +} - EVT MemoryVT = MGT->getMemoryVT(); - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); +SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) { + assert(N->isUnindexed() && "Indexed vp_store of vector?"); + SDValue Ch = N->getChain(); + SDValue Ptr = N->getBasePtr(); + SDValue Offset = N->getOffset(); + assert(Offset.isUndef() && "Unexpected VP store offset"); + SDValue Mask = N->getMask(); + SDValue EVL = N->getVectorLength(); + SDValue Data = N->getValue(); + Align Alignment = N->getOriginalAlign(); + SDLoc DL(N); - SDValue PassThruLo, PassThruHi; - if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(PassThru, PassThruLo, PassThruHi); + SDValue DataLo, DataHi; + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); else - std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl); + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); - SDValue IndexHi, IndexLo; - if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Index, 
IndexLo, IndexHi); - else - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); + // Split Mask operand + SDValue MaskLo, MaskHi; + if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + } + EVT MemoryVT = N->getMemoryVT(); + EVT LoMemVT, HiMemVT; + bool HiIsEmpty = false; + std::tie(LoMemVT, HiMemVT) = + DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty); + + // Split EVL + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, Data.getValueType(), DL); + + SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MGT->getPointerInfo(), MachineMemOperand::MOLoad, - MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(), - MGT->getRanges()); + N->getPointerInfo(), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; - SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, - OpsLo, MMO, MGT->getIndexType(), ExtType); + Lo = DAG.getStoreVP(Ch, DL, DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); - SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; - SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, - OpsHi, MMO, MGT->getIndexType(), ExtType); + // If the hi vp_store has zero storage size, only the lo vp_store is needed. + if (HiIsEmpty) + return Lo; - // Build a factor node to remember that this load is independent of the - // other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); + Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, + N->isCompressingStore()); - // Legalize the chain result - switch anything that used the old chain to - // use the new one. - ReplaceValueWith(SDValue(MGT, 1), Ch); + MachinePointerInfo MPI; + if (LoMemVT.isScalableVector()) { + Alignment = commonAlignment(Alignment, + LoMemVT.getSizeInBits().getKnownMinSize() / 8); + MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace()); + } else + MPI = N->getPointerInfo().getWithOffset( + LoMemVT.getStoreSize().getFixedSize()); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo, - Hi); - ReplaceValueWith(SDValue(MGT, 0), Res); - return SDValue(); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment, + N->getAAInfo(), N->getRanges()); + + Hi = DAG.getStoreVP(Ch, DL, DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); + + // Build a factor node to remember that this store is independent of the + // other one. 
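// A standalone model (plain C++, constants illustrative) of the address
// arithmetic in the VP store split above: the high half is written at the
// base pointer plus the low half's store size, which is what
// TLI.IncrementMemoryAddress computes in the non-compressing case (a
// compressing store instead advances by the active mask lanes).
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t EltBytes = 4; // element size of LoMemVT
  const uint64_t LoElts = 4;   // elements covered by LoMemVT
  uint64_t Base = 0x1000;
  uint64_t HiPtr = Base + LoElts * EltBytes;
  assert(HiPtr == 0x1010); // 16 bytes past the low half's start
  return 0;
}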
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, @@ -2703,64 +2862,87 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, return Res; } -SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, - unsigned OpNo) { - SDValue Ch = N->getChain(); +SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) { + SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); - SDValue Mask = N->getMask(); - SDValue Index = N->getIndex(); - SDValue Scale = N->getScale(); - SDValue Data = N->getValue(); EVT MemoryVT = N->getMemoryVT(); Align Alignment = N->getOriginalAlign(); SDLoc DL(N); - + struct Operands { + SDValue Mask; + SDValue Index; + SDValue Scale; + SDValue Data; + } Ops = [&]() -> Operands { + if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) { + return {MSC->getMask(), MSC->getIndex(), MSC->getScale(), + MSC->getValue()}; + } + auto *VPSC = cast<VPScatterSDNode>(N); + return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale(), + VPSC->getValue()}; + }(); // Split all operands EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + if (getTypeAction(Ops.Data.getValueType()) == TargetLowering::TypeSplitVector) // Split Data operand - GetSplitVector(Data, DataLo, DataHi); + GetSplitVector(Ops.Data, DataLo, DataHi); else - std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + std::tie(DataLo, DataHi) = DAG.SplitVector(Ops.Data, DL); // Split Mask operand SDValue MaskLo, MaskHi; - if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) { - SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + if (OpNo == 1 && Ops.Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi); } else { - if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Mask, MaskLo, MaskHi); - else - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, DL); } SDValue IndexHi, IndexLo; - if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) - GetSplitVector(Index, IndexLo, IndexHi); + if (getTypeAction(Ops.Index.getValueType()) == + TargetLowering::TypeSplitVector) + GetSplitVector(Ops.Index, IndexLo, IndexHi); else - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL); SDValue Lo; MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( N->getPointerInfo(), MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; - Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, - DL, OpsLo, MMO, N->getIndexType(), - N->isTruncatingStore()); + if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) { + SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale}; + Lo = + DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO, + MSC->getIndexType(), MSC->isTruncatingStore()); + + // The order of the Scatter operation after split is well defined. The "Hi" + // part comes after the "Lo". So these two operations should be chained one + // after another. 
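// A two-line standalone illustration (plain C++) of the ordering rule
// stated above: the half-scatters may store to overlapping addresses, so
// the high half must be sequenced after the low half -- which is why OpsHi
// below takes Lo as its chain operand where OpsLo took the incoming Ch.
#include <cassert>

int main() {
  int Slot = 0;
  Slot = 1; // low-half scatter element
  Slot = 2; // high-half scatter element; must execute second
  assert(Slot == 2);
  return 0;
}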
+ SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Ops.Scale}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi, + MMO, MSC->getIndexType(), + MSC->isTruncatingStore()); + } + auto *VPSC = cast<VPScatterSDNode>(N); + SDValue EVLLo, EVLHi; + std::tie(EVLLo, EVLHi) = + DAG.SplitEVL(VPSC->getVectorLength(), Ops.Data.getValueType(), DL); + + SDValue OpsLo[] = {Ch, DataLo, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo}; + Lo = DAG.getScatterVP(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO, + VPSC->getIndexType()); // The order of the Scatter operation after split is well defined. The "Hi" // part comes after the "Lo". So these two operations should be chained one // after another. - SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale}; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, - DL, OpsHi, MMO, N->getIndexType(), - N->isTruncatingStore()); + SDValue OpsHi[] = {Lo, DataHi, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi}; + return DAG.getScatterVP(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi, MMO, + VPSC->getIndexType()); } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -3047,31 +3229,41 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { break; case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::VSELECT: - case ISD::SELECT: Res = WidenVecRes_SELECT(N); break; + case ISD::SELECT: + case ISD::VP_SELECT: + case ISD::VP_MERGE: + Res = WidenVecRes_Select(N); + break; case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break; case ISD::SETCC: Res = WidenVecRes_SETCC(N); break; case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N)); break; + case ISD::VP_LOAD: + Res = WidenVecRes_VP_LOAD(cast<VPLoadSDNode>(N)); + break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N)); break; case ISD::MGATHER: Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N)); break; + case ISD::VP_GATHER: + Res = WidenVecRes_VP_GATHER(cast<VPGatherSDNode>(N)); + break; - case ISD::ADD: - case ISD::AND: - case ISD::MUL: + case ISD::ADD: case ISD::VP_ADD: + case ISD::AND: case ISD::VP_AND: + case ISD::MUL: case ISD::VP_MUL: case ISD::MULHS: case ISD::MULHU: - case ISD::OR: - case ISD::SUB: - case ISD::XOR: - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: + case ISD::OR: case ISD::VP_OR: + case ISD::SUB: case ISD::VP_SUB: + case ISD::XOR: case ISD::VP_XOR: + case ISD::SHL: case ISD::VP_SHL: + case ISD::SRA: case ISD::VP_ASHR: + case ISD::SRL: case ISD::VP_LSHR: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINIMUM: @@ -3088,7 +3280,21 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: - Res = WidenVecRes_Binary(N, /*IsVP*/ false); + // Vector-predicated binary op widening. Note that -- unlike the + // unpredicated versions -- we don't have to worry about trapping on + // operations like UDIV, FADD, etc., as we pass on the original vector + // length parameter. This means the widened elements containing garbage + // aren't active. 
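// A standalone model (plain C++, values illustrative) of the guarantee
// described above: after widening, the padding lanes hold arbitrary bits,
// but a VP divide never evaluates them because the original vector length
// is forwarded unchanged.
#include <cassert>
#include <vector>

int main() {
  // Two real lanes widened to four; the padding happens to contain a zero
  // divisor that would trap if it were ever evaluated.
  std::vector<int> A = {8, 6, -1, -1}, B = {2, 3, 0, 0};
  unsigned EVL = 2;
  std::vector<int> R(4, 0);
  for (unsigned I = 0; I < EVL; ++I)
    R[I] = A[I] / B[I]; // lanes 2 and 3 are never touched
  assert(R[0] == 4 && R[1] == 2 && R[2] == 0);
  return 0;
}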
+ case ISD::VP_SDIV: + case ISD::VP_UDIV: + case ISD::VP_SREM: + case ISD::VP_UREM: + case ISD::VP_FADD: + case ISD::VP_FSUB: + case ISD::VP_FMUL: + case ISD::VP_FDIV: + case ISD::VP_FREM: + Res = WidenVecRes_Binary(N); break; case ISD::FADD: @@ -3212,31 +3418,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FSHR: Res = WidenVecRes_Ternary(N); break; - case ISD::VP_ADD: - case ISD::VP_AND: - case ISD::VP_MUL: - case ISD::VP_OR: - case ISD::VP_SUB: - case ISD::VP_XOR: - case ISD::VP_SHL: - case ISD::VP_LSHR: - case ISD::VP_ASHR: - case ISD::VP_SDIV: - case ISD::VP_UDIV: - case ISD::VP_SREM: - case ISD::VP_UREM: - case ISD::VP_FADD: - case ISD::VP_FSUB: - case ISD::VP_FMUL: - case ISD::VP_FDIV: - case ISD::VP_FREM: - // Vector-predicated binary op widening. Note that -- unlike the - // unpredicated versions -- we don't have to worry about trapping on - // operations like UDIV, FADD, etc., as we pass on the original vector - // length parameter. This means the widened elements containing garbage - // aren't active. - Res = WidenVecRes_Binary(N, /*IsVP*/ true); - break; } // If Res is null, the sub-method took care of registering the result. @@ -3254,29 +3435,21 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3); } -SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N, bool IsVP) { +SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { // Binary op widening. SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); - if (!IsVP) + if (N->getNumOperands() == 2) return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags()); - // For VP operations, we must also widen the mask. Note that the mask type - // may not actually need widening, leading it be split along with the VP - // operation. - // FIXME: This could lead to an infinite split/widen loop. We only handle the - // case where the mask needs widening to an identically-sized type as the - // vector inputs. 
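// A standalone model (plain C++; names illustrative) of the structural
// test the unified handlers now use in place of the old IsVP flag: two
// operands means the plain form, four means the VP form carrying
// {LHS, RHS, Mask, EVL}.
#include <cassert>
#include <cstddef>

enum class Form { Plain, VP };

Form Classify(std::size_t NumOperands) {
  assert((NumOperands == 2 || NumOperands == 4) &&
         "Unexpected number of operands!");
  return NumOperands == 2 ? Form::Plain : Form::VP;
}

int main() {
  assert(Classify(2) == Form::Plain); // e.g. ISD::ADD
  assert(Classify(4) == Form::VP);    // e.g. ISD::VP_ADD
  return 0;
}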
- SDValue Mask = N->getOperand(2); - assert(getTypeAction(Mask.getValueType()) == - TargetLowering::TypeWidenVector && - "Unable to widen binary VP op"); - Mask = GetWidenedVector(Mask); - assert(Mask.getValueType().getVectorElementCount() == - WidenVT.getVectorElementCount() && - "Unable to widen binary VP op"); + + assert(N->getNumOperands() == 4 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDValue Mask = + GetWidenedMask(N->getOperand(2), WidenVT.getVectorElementCount()); return DAG.getNode(N->getOpcode(), dl, WidenVT, {InOp1, InOp2, Mask, N->getOperand(3)}, N->getFlags()); } @@ -4226,6 +4399,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { report_fatal_error("Unable to widen vector load"); } +SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue EVL = N->getVectorLength(); + ISD::LoadExtType ExtType = N->getExtensionType(); + SDLoc dl(N); + + // The mask should be widened as well + assert(getTypeAction(Mask.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen binary VP op"); + Mask = GetWidenedVector(Mask); + assert(Mask.getValueType().getVectorElementCount() == + TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()) + .getVectorElementCount() && + "Unable to widen vector load"); + + SDValue Res = + DAG.getLoadVP(N->getAddressingMode(), ExtType, WidenVT, dl, N->getChain(), + N->getBasePtr(), N->getOffset(), Mask, EVL, + N->getMemoryVT(), N->getMemOperand(), N->isExpandingLoad()); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); @@ -4289,6 +4489,29 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::WidenVecRes_VP_GATHER(VPGatherSDNode *N) { + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue Scale = N->getScale(); + ElementCount WideEC = WideVT.getVectorElementCount(); + SDLoc dl(N); + + SDValue Index = GetWidenedVector(N->getIndex()); + EVT WideMemVT = EVT::getVectorVT(*DAG.getContext(), + N->getMemoryVT().getScalarType(), WideEC); + Mask = GetWidenedMask(Mask, WideEC); + + SDValue Ops[] = {N->getChain(), N->getBasePtr(), Index, Scale, + Mask, N->getVectorLength()}; + SDValue Res = DAG.getGatherVP(DAG.getVTList(WideVT, MVT::Other), WideMemVT, + dl, Ops, N->getMemOperand(), N->getIndexType()); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::WidenVecRes_ScalarOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, N->getOperand(0)); @@ -4522,19 +4745,19 @@ SDValue DAGTypeLegalizer::WidenVSELECTMask(SDNode *N) { return Mask; } -SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_Select(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); ElementCount WidenEC = WidenVT.getVectorElementCount(); SDValue Cond1 = N->getOperand(0); EVT CondVT = Cond1.getValueType(); + unsigned Opcode = N->getOpcode(); if (CondVT.isVector()) { if (SDValue WideCond = WidenVSELECTMask(N)) { SDValue InOp1 = GetWidenedVector(N->getOperand(1)); SDValue InOp2 = GetWidenedVector(N->getOperand(2)); assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT); - return DAG.getNode(N->getOpcode(), SDLoc(N), - WidenVT, WideCond, InOp1, InOp2); + return DAG.getNode(Opcode, SDLoc(N), WidenVT, WideCond, InOp1, InOp2); } EVT CondEltVT = CondVT.getVectorElementType(); @@ -4560,8 +4783,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { SDValue InOp1 = GetWidenedVector(N->getOperand(1)); SDValue InOp2 = GetWidenedVector(N->getOperand(2)); assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT); - return DAG.getNode(N->getOpcode(), SDLoc(N), - WidenVT, Cond1, InOp1, InOp2); + return Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE + ? DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2, + N->getOperand(3)) + : DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2); } SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) { @@ -4711,9 +4936,11 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; + case ISD::VP_STORE: Res = WidenVecOp_VP_STORE(N, OpNo); break; case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break; case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; + case ISD::VP_SCATTER: Res = WidenVecOp_VP_SCATTER(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break; @@ -4766,6 +4993,23 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_SEQ_FMUL: Res = WidenVecOp_VECREDUCE_SEQ(N); break; + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_SEQ_FADD: + case ISD::VP_REDUCE_FMUL: + case ISD::VP_REDUCE_SEQ_FMUL: + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + case ISD::VP_REDUCE_FMAX: + case ISD::VP_REDUCE_FMIN: + Res = WidenVecOp_VP_REDUCE(N); + break; } // If Res is null, the sub-method took care of registering the result. 
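The VP_REDUCE cases above all funnel into WidenVecOp_VP_REDUCE (shown further below), and the reason widening is safe mirrors the binary-op case: the mask is widened with inactive lanes, so the appended elements never feed the reduction. A minimal sketch of that invariant, using plain C++ containers and a hypothetical helper rather than SelectionDAG nodes:

#include <cstdint>
#include <vector>

// Model of a VP reduce-add: a lane contributes only if it is below the
// explicit vector length and its mask bit is set.
uint64_t vpReduceAdd(const std::vector<uint32_t> &Val,
                     const std::vector<bool> &Mask, unsigned EVL) {
  uint64_t Acc = 0;
  for (size_t I = 0, E = Val.size(); I != E; ++I)
    if (I < EVL && Mask[I])
      Acc += Val[I];
  return Acc;
}

// Widening appends arbitrary value lanes but only 'false' mask lanes, so
// vpReduceAdd over the widened operands equals the original result.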
@@ -5092,15 +5336,54 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { unsigned NumVTElts = StVT.getVectorMinNumElements(); SDValue EVL = DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts)); - const auto *MMO = ST->getMemOperand(); - return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), Mask, - EVL, MMO->getPointerInfo(), MMO->getAlign(), - MMO->getFlags(), MMO->getAAInfo()); + return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), + DAG.getUNDEF(ST->getBasePtr().getValueType()), Mask, + EVL, StVal.getValueType(), ST->getMemOperand(), + ST->getAddressingMode()); } report_fatal_error("Unable to widen vector store"); } +SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) { + assert((OpNo == 1 || OpNo == 3) && + "Can widen only data or mask operand of vp_store"); + VPStoreSDNode *ST = cast<VPStoreSDNode>(N); + SDValue Mask = ST->getMask(); + SDValue StVal = ST->getValue(); + SDLoc dl(N); + + if (OpNo == 1) { + // Widen the value. + StVal = GetWidenedVector(StVal); + + // We only handle the case where the mask needs widening to an + // identically-sized type as the vector inputs. + assert(getTypeAction(Mask.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen VP store"); + Mask = GetWidenedVector(Mask); + } else { + Mask = GetWidenedVector(Mask); + + // We only handle the case where the stored value needs widening to an + // identically-sized type as the mask. + assert(getTypeAction(StVal.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen VP store"); + StVal = GetWidenedVector(StVal); + } + + assert(Mask.getValueType().getVectorElementCount() == + StVal.getValueType().getVectorElementCount() && + "Mask and data vectors should have the same number of elements"); + return DAG.getStoreVP(ST->getChain(), dl, StVal, ST->getBasePtr(), + ST->getOffset(), Mask, ST->getVectorLength(), + ST->getMemoryVT(), ST->getMemOperand(), + ST->getAddressingMode(), ST->isTruncatingStore(), + ST->isCompressingStore()); +} + SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { assert((OpNo == 1 || OpNo == 3) && "Can widen only data or mask operand of mstore"); @@ -5202,6 +5485,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { MSC->isTruncatingStore()); } +SDValue DAGTypeLegalizer::WidenVecOp_VP_SCATTER(SDNode *N, unsigned OpNo) { + VPScatterSDNode *VPSC = cast<VPScatterSDNode>(N); + SDValue DataOp = VPSC->getValue(); + SDValue Mask = VPSC->getMask(); + SDValue Index = VPSC->getIndex(); + SDValue Scale = VPSC->getScale(); + EVT WideMemVT = VPSC->getMemoryVT(); + + if (OpNo == 1) { + DataOp = GetWidenedVector(DataOp); + Index = GetWidenedVector(Index); + const auto WideEC = DataOp.getValueType().getVectorElementCount(); + Mask = GetWidenedMask(Mask, WideEC); + WideMemVT = EVT::getVectorVT(*DAG.getContext(), + VPSC->getMemoryVT().getScalarType(), WideEC); + } else if (OpNo == 4) { + // Just widen the index. It's allowed to have extra elements. 
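    // (The extra index lanes are never read: the unchanged mask and vector
    //  length still decide which lanes are actually scattered.)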
+ Index = GetWidenedVector(Index); + } else + llvm_unreachable("Can't widen this operand of mscatter"); + + SDValue Ops[] = { + VPSC->getChain(), DataOp, VPSC->getBasePtr(), Index, Scale, Mask, + VPSC->getVectorLength()}; + return DAG.getScatterVP(DAG.getVTList(MVT::Other), WideMemVT, SDLoc(N), Ops, + VPSC->getMemOperand(), VPSC->getIndexType()); +} + SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue InOp0 = GetWidenedVector(N->getOperand(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -5320,6 +5631,19 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags); } +SDValue DAGTypeLegalizer::WidenVecOp_VP_REDUCE(SDNode *N) { + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDLoc dl(N); + SDValue Op = GetWidenedVector(N->getOperand(1)); + SDValue Mask = GetWidenedMask(N->getOperand(2), + Op.getValueType().getVectorElementCount()); + + return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), + {N->getOperand(0), Op, Mask, N->getOperand(3)}, + N->getFlags()); +} + SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) { // This only gets called in the case that the left and right inputs and // result are of a legal odd vector type, and the condition is illegal i1 of @@ -5779,6 +6103,8 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, EVT InVT = InOp.getValueType(); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); + assert(!InVT.isScalableVector() && !NVT.isScalableVector() && + "cannot modify scalable vectors in this way"); SDLoc dl(InOp); // Check if InOp already has the right width. diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index aec2cf38b400..403f34573899 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -286,7 +286,7 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { // Cluster loads by adding MVT::Glue outputs and inputs. This also // ensure they are scheduled in order of increasing addresses. SDNode *Lead = Loads[0]; - SDValue InGlue = SDValue(nullptr, 0); + SDValue InGlue; if (AddGlue(Lead, InGlue, true, DAG)) InGlue = SDValue(Lead, Lead->getNumValues() - 1); for (unsigned I = 1, E = Loads.size(); I != E; ++I) { @@ -1057,12 +1057,13 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { "first terminator cannot be a debug value"); for (MachineInstr &MI : make_early_inc_range( make_range(std::next(FirstTerm), InsertBB->end()))) { + // Only scan up to insertion point. + if (&MI == InsertPos) + break; + if (!MI.isDebugValue()) continue; - if (&MI == InsertPos) - InsertPos = std::prev(InsertPos->getIterator()); - // The DBG_VALUE was referencing a value produced by a terminator. By // moving the DBG_VALUE, the referenced value also needs invalidating. 
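      // (Register 0 is the "no register" sentinel, so the operand change below
      //  rewrites the DBG_VALUE to the canonical $noreg "value unavailable"
      //  form instead of leaving it pointing at a stale def.)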
MI.getOperand(0).ChangeToRegister(0, false); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 2ae0d4df7b77..45f3005e8f57 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -373,31 +373,46 @@ ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) { llvm_unreachable("Expected VECREDUCE opcode"); case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_SEQ_FADD: + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_SEQ_FADD: return ISD::FADD; case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_SEQ_FMUL: + case ISD::VP_REDUCE_FMUL: + case ISD::VP_REDUCE_SEQ_FMUL: return ISD::FMUL; case ISD::VECREDUCE_ADD: + case ISD::VP_REDUCE_ADD: return ISD::ADD; case ISD::VECREDUCE_MUL: + case ISD::VP_REDUCE_MUL: return ISD::MUL; case ISD::VECREDUCE_AND: + case ISD::VP_REDUCE_AND: return ISD::AND; case ISD::VECREDUCE_OR: + case ISD::VP_REDUCE_OR: return ISD::OR; case ISD::VECREDUCE_XOR: + case ISD::VP_REDUCE_XOR: return ISD::XOR; case ISD::VECREDUCE_SMAX: + case ISD::VP_REDUCE_SMAX: return ISD::SMAX; case ISD::VECREDUCE_SMIN: + case ISD::VP_REDUCE_SMIN: return ISD::SMIN; case ISD::VECREDUCE_UMAX: + case ISD::VP_REDUCE_UMAX: return ISD::UMAX; case ISD::VECREDUCE_UMIN: + case ISD::VP_REDUCE_UMIN: return ISD::UMIN; case ISD::VECREDUCE_FMAX: + case ISD::VP_REDUCE_FMAX: return ISD::FMAXNUM; case ISD::VECREDUCE_FMIN: + case ISD::VP_REDUCE_FMIN: return ISD::FMINNUM; } } @@ -3066,7 +3081,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, case ISD::MUL: { Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - Known = KnownBits::mul(Known, Known2); + bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1); + Known = KnownBits::mul(Known, Known2, SelfMultiply); break; } case ISD::MULHU: { @@ -3085,8 +3101,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result"); Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1); if (Op.getResNo() == 0) - Known = KnownBits::mul(Known, Known2); + Known = KnownBits::mul(Known, Known2, SelfMultiply); else Known = KnownBits::mulhu(Known, Known2); break; @@ -3095,8 +3112,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result"); Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1); if (Op.getResNo() == 0) - Known = KnownBits::mul(Known, Known2); + Known = KnownBits::mul(Known, Known2, SelfMultiply); else Known = KnownBits::mulhs(Known, Known2); break; @@ -3363,6 +3381,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, case ISD::AssertAlign: { unsigned LogOfAlign = Log2(cast<AssertAlignSDNode>(Op)->getAlign()); assert(LogOfAlign != 0); + + // TODO: Should use maximum with source // If a node is guaranteed to be aligned, set low zero bits accordingly as // well as clearing one bits. 
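    // (For example, an AssertAlign of 16 gives LogOfAlign == 4, so bits [3:0]
    //  of the value are known to be zero.)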
Known.Zero.setLowBits(LogOfAlign); @@ -3584,6 +3604,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::smin(Known, Known2); break; } + case ISD::FP_TO_UINT_SAT: { + // FP_TO_UINT_SAT produces an unsigned value that fits in the saturating VT. + EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + Known.Zero |= APInt::getBitsSetFrom(BitWidth, VT.getScalarSizeInBits()); + break; + } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: if (Op.getResNo() == 1) { // The boolean result conforms to getBooleanContents. @@ -3860,6 +3886,10 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, break; } + case ISD::FP_TO_SINT_SAT: + // FP_TO_SINT_SAT produces a signed value that fits in the saturating VT. + Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits(); + return VTBits - Tmp + 1; case ISD::SIGN_EXTEND: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp; @@ -4252,7 +4282,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, // scalar cases. Type *CstTy = Cst->getType(); if (CstTy->isVectorTy() && - (NumElts * VTBits) == CstTy->getPrimitiveSizeInBits()) { + (NumElts * VTBits) == CstTy->getPrimitiveSizeInBits() && + VTBits == CstTy->getScalarSizeInBits()) { Tmp = VTBits; for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) @@ -4294,31 +4325,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. KnownBits Known = computeKnownBits(Op, DemandedElts, Depth); - - APInt Mask; - if (Known.isNonNegative()) { // sign bit is 0 - Mask = Known.Zero; - } else if (Known.isNegative()) { // sign bit is 1; - Mask = Known.One; - } else { - // Nothing known. - return FirstAnswer; - } - - // Okay, we know that the sign bit in Mask is set. Use CLO to determine - // the number of identical bits in the top of the input value. - Mask <<= Mask.getBitWidth()-VTBits; - return std::max(FirstAnswer, Mask.countLeadingOnes()); + return std::max(FirstAnswer, Known.countMinSignBits()); } -unsigned SelectionDAG::ComputeMinSignedBits(SDValue Op, unsigned Depth) const { +unsigned SelectionDAG::ComputeMaxSignificantBits(SDValue Op, + unsigned Depth) const { unsigned SignBits = ComputeNumSignBits(Op, Depth); return Op.getScalarValueSizeInBits() - SignBits + 1; } -unsigned SelectionDAG::ComputeMinSignedBits(SDValue Op, - const APInt &DemandedElts, - unsigned Depth) const { +unsigned SelectionDAG::ComputeMaxSignificantBits(SDValue Op, + const APInt &DemandedElts, + unsigned Depth) const { unsigned SignBits = ComputeNumSignBits(Op, DemandedElts, Depth); return Op.getScalarValueSizeInBits() - SignBits + 1; } @@ -5102,6 +5120,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, "BSWAP types must be a multiple of 16 bits!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); + // bswap(bswap(X)) -> X. + if (OpOpcode == ISD::BSWAP) + return Operand.getOperand(0); break; case ISD::BITREVERSE: assert(VT.isInteger() && VT == Operand.getValueType() && @@ -5398,6 +5419,19 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, } } + // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)). 
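  //   e.g. (mul step_vector(2), splat(3)) --> step_vector(6)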
+ // (shl step_vector(C0), C1) -> (step_vector(C0 << C1)) + if ((Opcode == ISD::MUL || Opcode == ISD::SHL) && + Ops[0].getOpcode() == ISD::STEP_VECTOR) { + APInt RHSVal; + if (ISD::isConstantSplatVector(Ops[1].getNode(), RHSVal)) { + APInt NewStep = Opcode == ISD::MUL + ? Ops[0].getConstantOperandAPInt(0) * RHSVal + : Ops[0].getConstantOperandAPInt(0) << RHSVal; + return getStepVector(DL, VT, NewStep); + } + } + auto IsScalarOrSameVectorSize = [NumElts](const SDValue &Op) { return !Op.getValueType().isVector() || Op.getValueType().getVectorElementCount() == NumElts; @@ -5595,22 +5629,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(N1.getOpcode() != ISD::DELETED_NODE && N2.getOpcode() != ISD::DELETED_NODE && "Operand is DELETED_NODE!"); - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); - ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); - ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); - // Canonicalize constant to RHS if commutative. if (TLI->isCommutativeBinOp(Opcode)) { - if (N1C && !N2C) { - std::swap(N1C, N2C); + bool IsN1C = isConstantIntBuildVectorOrConstantInt(N1); + bool IsN2C = isConstantIntBuildVectorOrConstantInt(N2); + bool IsN1CFP = isConstantFPBuildVectorOrConstantFP(N1); + bool IsN2CFP = isConstantFPBuildVectorOrConstantFP(N2); + if ((IsN1C && !IsN2C) || (IsN1CFP && !IsN2CFP)) std::swap(N1, N2); - } else if (N1CFP && !N2CFP) { - std::swap(N1CFP, N2CFP); - std::swap(N1, N2); - } } + auto *N1C = dyn_cast<ConstantSDNode>(N1); + auto *N2C = dyn_cast<ConstantSDNode>(N2); + + // Don't allow undefs in vector splats - we might be returning N2 when folding + // to zero etc. + ConstantSDNode *N2CV = + isConstOrConstSplat(N2, /*AllowUndefs*/ false, /*AllowTruncation*/ true); + switch (Opcode) { default: break; case ISD::TokenFactor: @@ -5640,9 +5676,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N1.getValueType() == VT && "Binary operator types must match!"); // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's // worth handling here. - if (N2C && N2C->isZero()) + if (N2CV && N2CV->isZero()) return N2; - if (N2C && N2C->isAllOnes()) // X & -1 -> X + if (N2CV && N2CV->isAllOnes()) // X & -1 -> X return N1; break; case ISD::OR: @@ -5654,7 +5690,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N1.getValueType() == VT && "Binary operator types must match!"); // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so // it's worth handling here. - if (N2C && N2C->isZero()) + if (N2CV && N2CV->isZero()) return N1; if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.isVector() && VT.getVectorElementType() == MVT::i1) @@ -5760,7 +5796,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // size of the value, the shift/rotate count is guaranteed to be zero. 
if (VT == MVT::i1) return N1; - if (N2C && N2C->isZero()) + if (N2CV && N2CV->isZero()) return N1; break; case ISD::FP_ROUND: @@ -6358,7 +6394,7 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, Type *Ty = VT.getTypeForEVT(*DAG.getContext()); if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty)) return DAG.getConstant(Val, dl, VT); - return SDValue(nullptr, 0); + return SDValue(); } SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset, @@ -7697,23 +7733,6 @@ SDValue SelectionDAG::getLoadVP(ISD::MemIndexedMode AM, SDValue Offset, SDValue Mask, SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) { - if (VT == MemVT) { - ExtType = ISD::NON_EXTLOAD; - } else if (ExtType == ISD::NON_EXTLOAD) { - assert(VT == MemVT && "Non-extending load from different memory type!"); - } else { - // Extending load. - assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) && - "Should only be an extending load, not truncating!"); - assert(VT.isInteger() == MemVT.isInteger() && - "Cannot convert from FP to Int or Int -> FP!"); - assert(VT.isVector() == MemVT.isVector() && - "Cannot use an ext load to convert to or from a vector!"); - assert((!VT.isVector() || - VT.getVectorElementCount() == MemVT.getVectorElementCount()) && - "Cannot use an ext load to change the number of vector elements!"); - } - bool Indexed = AM != ISD::UNINDEXED; assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!"); @@ -7802,48 +7821,29 @@ SDValue SelectionDAG::getIndexedLoadVP(SDValue OrigLoad, const SDLoc &dl, } SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, - SDValue Ptr, SDValue Mask, SDValue EVL, - MachinePointerInfo PtrInfo, Align Alignment, - MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo, bool IsCompressing) { + SDValue Ptr, SDValue Offset, SDValue Mask, + SDValue EVL, EVT MemVT, MachineMemOperand *MMO, + ISD::MemIndexedMode AM, bool IsTruncating, + bool IsCompressing) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - - MMOFlags |= MachineMemOperand::MOStore; - assert((MMOFlags & MachineMemOperand::MOLoad) == 0); - - if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); - - MachineFunction &MF = getMachineFunction(); - uint64_t Size = - MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize()); - MachineMemOperand *MMO = - MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo); - return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing); -} - -SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, - SDValue Ptr, SDValue Mask, SDValue EVL, - MachineMemOperand *MMO, bool IsCompressing) { - assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - EVT VT = Val.getValueType(); - SDVTList VTs = getVTList(MVT::Other); - SDValue Undef = getUNDEF(Ptr.getValueType()); - SDValue Ops[] = {Chain, Val, Ptr, Undef, Mask, EVL}; + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!"); + SDVTList VTs = Indexed ? 
getVTList(Ptr.getValueType(), MVT::Other)
+                          : getVTList(MVT::Other);
+  SDValue Ops[] = {Chain, Val, Ptr, Offset, Mask, EVL};
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
-  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(MemVT.getRawBits());
   ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
-      dl.getIROrder(), VTs, ISD::UNINDEXED, false, IsCompressing, VT, MMO));
+      dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
     cast<VPStoreSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  auto *N =
-      newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
-                               ISD::UNINDEXED, false, IsCompressing, VT, MMO);
+  auto *N = newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+                                     IsTruncating, IsCompressing, MemVT, MMO);
   createOperands(N, Ops);
   CSEMap.InsertNode(N, IP);
@@ -7885,7 +7885,9 @@ SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
   if (VT == SVT)
-    return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing);
+    return getStoreVP(Chain, dl, Val, Ptr, getUNDEF(Ptr.getValueType()), Mask,
+                      EVL, VT, MMO, ISD::UNINDEXED,
+                      /*IsTruncating*/ false, IsCompressing);
   assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
          "Should only be a truncating store, not extending!");
@@ -10661,6 +10663,23 @@ SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
   return std::make_pair(Lo, Hi);
 }
+std::pair<SDValue, SDValue> SelectionDAG::SplitEVL(SDValue N, EVT VecVT,
+                                                   const SDLoc &DL) {
+  // Split the vector length parameter.
+  // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl, %halfnumelts).
+  EVT VT = N.getValueType();
+  assert(VecVT.getVectorElementCount().isKnownEven() &&
+         "Expecting the mask to be an evenly-sized vector");
+  unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
+  SDValue HalfNumElts =
+      VecVT.isFixedLengthVector()
+          ? getConstant(HalfMinNumElts, DL, VT)
+          : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts));
+  SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts);
+  SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts);
+  return std::make_pair(Lo, Hi);
+}
+
/// Widen the vector up to the next power of two using INSERT_SUBVECTOR.
SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) { EVT VT = N.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 63cd723cf6da..41460f78e1c2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1683,6 +1683,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const MetadataAsValue *MD = dyn_cast<MetadataAsValue>(V)) { return DAG.getMDNode(cast<MDNode>(MD->getMetadata())); } + if (const auto *BB = dyn_cast<BasicBlock>(V)) + return DAG.getBasicBlock(FuncInfo.MBBMap[BB]); llvm_unreachable("Can't get register for value!"); } @@ -4846,10 +4848,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } if (!I.getType()->isVoidTy()) { - if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) { - EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy); - Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result); - } else + if (!isa<VectorType>(I.getType())) Result = lowerRangeToAssertZExt(DAG, I, Result); MaybeAlign Alignment = I.getRetAlign(); @@ -7327,8 +7326,6 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(0); MaybeAlign Alignment = VPIntrin.getPointerAlignment(); - if (!Alignment) - Alignment = DAG.getEVTAlign(VT); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); SDValue LD; @@ -7336,6 +7333,8 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, if (!IsGather) { // Do not serialize variable-length loads of constant memory with // anything. + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode();
@@ -7345,6 +7344,8 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT,
     LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
                        MMO, false /*IsExpanding */);
   } else {
+    if (!Alignment)
+      Alignment = DAG.getEVTAlign(VT.getScalarType());
     unsigned AS =
         PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -7385,18 +7386,22 @@ void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin,
   Value *PtrOperand = VPIntrin.getArgOperand(1);
   EVT VT = OpValues[0].getValueType();
   MaybeAlign Alignment = VPIntrin.getPointerAlignment();
-  if (!Alignment)
-    Alignment = DAG.getEVTAlign(VT);
   AAMDNodes AAInfo = VPIntrin.getAAMetadata();
   SDValue ST;
   if (!IsScatter) {
+    if (!Alignment)
+      Alignment = DAG.getEVTAlign(VT);
+    SDValue Ptr = OpValues[1];
+    SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
         MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
         MemoryLocation::UnknownSize, *Alignment, AAInfo);
-    ST =
-        DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], OpValues[1],
-                       OpValues[2], OpValues[3], MMO, false /* IsTruncating */);
+    ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
+                        OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
+                        /* IsTruncating */ false, /*IsCompressing*/ false);
   } else {
+    if (!Alignment)
+      Alignment = DAG.getEVTAlign(VT.getScalarType());
     unsigned AS =
         PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -8250,7 +8255,8 @@ public:
   /// corresponds to. If there is no Value* for this operand, it returns
   /// MVT::Other.
   EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
-                           const DataLayout &DL) const {
+                           const DataLayout &DL,
+                           llvm::Type *ParamElemType) const {
     if (!CallOperandVal)
       return MVT::Other;
     if (isa<BasicBlock>(CallOperandVal))
@@ -8262,10 +8268,8 @@ public:
     // If this is an indirect operand, the operand is a pointer to the
     // accessed type.
     if (isIndirect) {
-      PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
-      if (!PtrTy)
-        report_fatal_error("Indirect operand for inline asm not a pointer!");
-      OpTy = PtrTy->getElementType();
+      OpTy = ParamElemType;
+      assert(OpTy && "Indirect operand must have elementtype attribute");
     }
     // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
@@ -8559,37 +8563,19 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
   unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
   unsigned ResNo = 0;   // ResNo - The result number of the next output.
-  unsigned NumMatchingOps = 0;
   for (auto &T : TargetConstraints) {
     ConstraintOperands.push_back(SDISelAsmOperandInfo(T));
     SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
     // Compute the value type for each operand.
-    if (OpInfo.Type == InlineAsm::isInput ||
-        (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
-      OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
-
-      // Process the call argument. BasicBlocks are labels, currently appearing
-      // only in asm's.
- if (isa<CallBrInst>(Call) && - ArgNo - 1 >= (cast<CallBrInst>(&Call)->arg_size() - - cast<CallBrInst>(&Call)->getNumIndirectDests() - - NumMatchingOps) && - (NumMatchingOps == 0 || - ArgNo - 1 < - (cast<CallBrInst>(&Call)->arg_size() - NumMatchingOps))) { - const auto *BA = cast<BlockAddress>(OpInfo.CallOperandVal); - EVT VT = TLI.getValueType(DAG.getDataLayout(), BA->getType(), true); - OpInfo.CallOperand = DAG.getTargetBlockAddress(BA, VT); - } else if (const auto *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) { - OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]); - } else { - OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); - } - + if (OpInfo.hasArg()) { + OpInfo.CallOperandVal = Call.getArgOperand(ArgNo); + OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); + Type *ParamElemTy = Call.getAttributes().getParamElementType(ArgNo); EVT VT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, - DAG.getDataLayout()); + DAG.getDataLayout(), ParamElemTy); OpInfo.ConstraintVT = VT.isSimple() ? VT.getSimpleVT() : MVT::Other; + ArgNo++; } else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) { // The return value of the call is this value. As such, there is no // corresponding argument. @@ -8607,9 +8593,6 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, OpInfo.ConstraintVT = MVT::Other; } - if (OpInfo.hasMatchingInput()) - ++NumMatchingOps; - if (!HasSideEffect) HasSideEffect = OpInfo.hasMemory(TLI); @@ -11246,12 +11229,6 @@ void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) { unsigned NumElts = VT.getVectorNumElements(); - if ((-Imm > NumElts) || (Imm >= NumElts)) { - // Result is undefined if immediate is out-of-bounds. - setValue(&I, DAG.getUNDEF(VT)); - return; - } - uint64_t Idx = (NumElts + Imm) % NumElts; // Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e6b06ab93d6b..a98c21f16c71 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -60,7 +60,7 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, // Conservatively require the attributes of the call to match those of // the return. Ignore following attributes because they don't affect the // call sequence. - AttrBuilder CallerAttrs(F.getAttributes(), AttributeList::ReturnIndex); + AttrBuilder CallerAttrs(F.getContext(), F.getAttributes().getRetAttrs()); for (const auto &Attr : {Attribute::Alignment, Attribute::Dereferenceable, Attribute::DereferenceableOrNull, Attribute::NoAlias, Attribute::NonNull}) @@ -1806,6 +1806,31 @@ bool TargetLowering::SimplifyDemandedBits( } case ISD::BSWAP: { SDValue Src = Op.getOperand(0); + + // If the only bits demanded come from one byte of the bswap result, + // just shift the input byte into position to eliminate the bswap. + unsigned NLZ = DemandedBits.countLeadingZeros(); + unsigned NTZ = DemandedBits.countTrailingZeros(); + + // Round NTZ down to the next byte. If we have 11 trailing zeros, then + // we need all the bits down to bit 8. Likewise, round NLZ. If we + // have 14 leading zeros, round to 8. + NLZ = alignDown(NLZ, 8); + NTZ = alignDown(NTZ, 8); + // If we need exactly one byte, we can do this transformation. + if (BitWidth - NLZ - NTZ == 8) { + // Replace this with either a left or right shift to get the byte into + // the right place. + unsigned ShiftOpcode = NLZ > NTZ ? 
ISD::SRL : ISD::SHL; + if (!TLO.LegalOperations() || isOperationLegal(ShiftOpcode, VT)) { + EVT ShiftAmtTy = getShiftAmountTy(VT, DL); + unsigned ShiftAmount = NLZ > NTZ ? NLZ - NTZ : NTZ - NLZ; + SDValue ShAmt = TLO.DAG.getConstant(ShiftAmount, dl, ShiftAmtTy); + SDValue NewOp = TLO.DAG.getNode(ShiftOpcode, dl, VT, Src, ShAmt); + return TLO.CombineTo(Op, NewOp); + } + } + APInt DemandedSrcBits = DemandedBits.byteSwap(); if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO, Depth + 1)) @@ -1833,19 +1858,15 @@ bool TargetLowering::SimplifyDemandedBits( // If we only care about the highest bit, don't bother shifting right. if (DemandedBits.isSignMask()) { unsigned MinSignedBits = - TLO.DAG.ComputeMinSignedBits(Op0, DemandedElts, Depth + 1); + TLO.DAG.ComputeMaxSignificantBits(Op0, DemandedElts, Depth + 1); bool AlreadySignExtended = ExVTBits >= MinSignedBits; // However if the input is already sign extended we expect the sign // extension to be dropped altogether later and do not simplify. if (!AlreadySignExtended) { // Compute the correct shift amount type, which must be getShiftAmountTy // for scalar types after legalization. - EVT ShiftAmtTy = VT; - if (TLO.LegalTypes() && !ShiftAmtTy.isVector()) - ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL); - - SDValue ShiftAmt = - TLO.DAG.getConstant(BitWidth - ExVTBits, dl, ShiftAmtTy); + SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ExVTBits, dl, + getShiftAmountTy(VT, DL)); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, ShiftAmt)); } @@ -3233,17 +3254,29 @@ bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT, SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, DAGCombinerInfo &DCI) const { - // Match these patterns in any of their permutations: - // (X & Y) == Y - // (X & Y) != Y if (N1.getOpcode() == ISD::AND && N0.getOpcode() != ISD::AND) std::swap(N0, N1); + SelectionDAG &DAG = DCI.DAG; EVT OpVT = N0.getValueType(); if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() || (Cond != ISD::SETEQ && Cond != ISD::SETNE)) return SDValue(); + // (X & Y) != 0 --> zextOrTrunc(X & Y) + // iff everything but LSB is known zero: + if (Cond == ISD::SETNE && isNullConstant(N1) && + (getBooleanContents(OpVT) == TargetLowering::UndefinedBooleanContent || + getBooleanContents(OpVT) == TargetLowering::ZeroOrOneBooleanContent)) { + unsigned NumEltBits = OpVT.getScalarSizeInBits(); + APInt UpperBits = APInt::getHighBitsSet(NumEltBits, NumEltBits - 1); + if (DAG.MaskedValueIsZero(N0, UpperBits)) + return DAG.getBoolExtOrTrunc(N0, DL, VT, OpVT); + } + + // Match these patterns in any of their permutations: + // (X & Y) == Y + // (X & Y) != Y SDValue X, Y; if (N0.getOperand(0) == N1) { X = N0.getOperand(1); @@ -3255,7 +3288,6 @@ SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, return SDValue(); } - SelectionDAG &DAG = DCI.DAG; SDValue Zero = DAG.getConstant(0, DL, OpVT); if (DAG.isKnownToBeAPowerOfTwo(Y)) { // Simplify X & Y == Y to X & Y != 0 if Y has exactly one bit set. @@ -3678,9 +3710,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } // Figure out how many bits we need to preserve this constant. - unsigned ReqdBits = Signed ? - C1.getBitWidth() - C1.getNumSignBits() + 1 : - C1.getActiveBits(); + unsigned ReqdBits = Signed ? C1.getMinSignedBits() : C1.getActiveBits(); // Make sure we're not losing bits from the constant. 
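      // (For instance, C1 == -1 needs only a single signed bit, while
      //  C1 == 255 needs eight active bits when treated as unsigned.)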
if (MinBits > 0 &&
@@ -4594,20 +4624,12 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
   default: break;
-  case 'X':    // Allows any operand; labels (basic block) use this.
-    if (Op.getOpcode() == ISD::BasicBlock ||
-        Op.getOpcode() == ISD::TargetBlockAddress) {
-      Ops.push_back(Op);
-      return;
-    }
-    LLVM_FALLTHROUGH;
+  case 'X':    // Allows any operand
   case 'i':    // Simple Integer or Relocatable Constant
   case 'n':    // Simple Integer
   case 's': {  // Relocatable Constant
-    GlobalAddressSDNode *GA;
     ConstantSDNode *C;
-    BlockAddressSDNode *BA;
     uint64_t Offset = 0;
     // Match (GA) or (C) or (GA+C) or (GA-C) or ((GA+C)+C) or (((GA+C)+C)+C),
@@ -4615,13 +4637,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     // SelectionDAG::FoldSymbolOffset because it expects the GA to be accessible
     // while in this case the GA may be furthest from the root node which is
     // likely an ISD::ADD.
-    while (1) {
-      if ((GA = dyn_cast<GlobalAddressSDNode>(Op)) && ConstraintLetter != 'n') {
-        Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
-                                                 GA->getValueType(0),
-                                                 Offset + GA->getOffset()));
-        return;
-      }
+    while (true) {
       if ((C = dyn_cast<ConstantSDNode>(Op)) && ConstraintLetter != 's') {
         // gcc prints these as sign extended.  Sign extend value to 64 bits
         // now; without this it would get ZExt'd later in
@@ -4636,11 +4652,23 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
             DAG.getTargetConstant(Offset + ExtVal, SDLoc(C), MVT::i64));
         return;
       }
-      if ((BA = dyn_cast<BlockAddressSDNode>(Op)) && ConstraintLetter != 'n') {
-        Ops.push_back(DAG.getTargetBlockAddress(
-            BA->getBlockAddress(), BA->getValueType(0),
-            Offset + BA->getOffset(), BA->getTargetFlags()));
-        return;
+      if (ConstraintLetter != 'n') {
+        if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+          Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
+                                                   GA->getValueType(0),
+                                                   Offset + GA->getOffset()));
+          return;
+        }
+        if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+          Ops.push_back(DAG.getTargetBlockAddress(
+              BA->getBlockAddress(), BA->getValueType(0),
+              Offset + BA->getOffset(), BA->getTargetFlags()));
+          return;
+        }
+        if (isa<BasicBlockSDNode>(Op)) {
+          Ops.push_back(Op);
+          return;
+        }
       }
       const unsigned OpCode = Op.getOpcode();
       if (OpCode == ISD::ADD || OpCode == ISD::SUB) {
@@ -4753,7 +4781,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
     case InlineAsm::isOutput:
       // Indirect outputs just consume an argument.
       if (OpInfo.isIndirect) {
-        OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
+        OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
         break;
       }
@@ -4771,7 +4799,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
       ++ResNo;
       break;
     case InlineAsm::isInput:
-      OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
+      OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
       break;
     case InlineAsm::isClobber:
       // Nothing to do.
@@ -4781,10 +4809,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
     if (OpInfo.CallOperandVal) {
       llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
       if (OpInfo.isIndirect) {
-        llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
-        if (!PtrTy)
-          report_fatal_error("Indirect operand for inline asm not a pointer!");
-        OpTy = PtrTy->getElementType();
+        OpTy = Call.getAttributes().getParamElementType(ArgNo);
+        assert(OpTy && "Indirect operand must have elementtype attribute");
       }
       // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
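As in getCallOperandValEVT in SelectionDAGBuilder above, the pointee type of an indirect operand is now taken from the elementtype parameter attribute rather than from the pointer's element type. A short sketch of the IR-level contract this assumes (illustrative only; the asm string is made up):

// An indirect inline-asm operand is expected to carry an explicit
// elementtype attribute in the IR, e.g.:
//   call void asm "movl $1, $0", "=*m,r"(i32* elementtype(i32) %p, i32 %v)
// and the accessed type is then recovered from the attribute list:
Type *AccessTy = Call.getAttributes().getParamElementType(ArgNo);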
@@ -4814,6 +4840,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL, } else { OpInfo.ConstraintVT = MVT::getVT(OpTy, true); } + + ArgNo++; } } @@ -5087,17 +5115,18 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, // 'X' matches anything. if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) { - // Labels and constants are handled elsewhere ('X' is the only thing - // that matches labels). For Functions, the type here is the type of - // the result, which is not what we want to look at; leave them alone. + // Constants are handled elsewhere. For Functions, the type here is the + // type of the result, which is not what we want to look at; leave them + // alone. Value *v = OpInfo.CallOperandVal; - if (isa<BasicBlock>(v) || isa<ConstantInt>(v) || isa<Function>(v)) { - OpInfo.CallOperandVal = v; + if (isa<ConstantInt>(v) || isa<Function>(v)) { return; } - if (Op.getNode() && Op.getOpcode() == ISD::TargetBlockAddress) + if (isa<BasicBlock>(v) || isa<BlockAddress>(v)) { + OpInfo.ConstraintCode = "i"; return; + } // Otherwise, try to resolve it to something we know about by looking at // the actual operand type. @@ -6438,12 +6467,6 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl, unsigned ShiftAmount = OuterBitSize - InnerBitSize; EVT ShiftAmountTy = getShiftAmountTy(VT, DAG.getDataLayout()); - if (APInt::getMaxValue(ShiftAmountTy.getSizeInBits()).ult(ShiftAmount)) { - // FIXME getShiftAmountTy does not always return a sensible result when VT - // is an illegal type, and so the type may be too small to fit the shift - // amount. Override it with i32. The shift will have to be legalized. - ShiftAmountTy = MVT::i32; - } SDValue Shift = DAG.getConstant(ShiftAmount, dl, ShiftAmountTy); if (!LH.getNode() && !RH.getNode() && diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index f89069e9f728..f6ad2b50abcd 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -273,6 +273,8 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, LLVM_DEBUG(dbgs() << "Frame instruction: " << MI << '\n'); return true; } + const MachineFunction *MF = MI.getParent()->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); for (const MachineOperand &MO : MI.operands()) { bool UseOrDefCSR = false; if (MO.isReg()) { @@ -288,8 +290,14 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, // separately. An SP mentioned by a call instruction, we can ignore, // though, as it's harmless and we do not want to effectively disable tail // calls by forcing the restore point to post-dominate them. - UseOrDefCSR = (!MI.isCall() && PhysReg == SP) || - RCI.getLastCalleeSavedAlias(PhysReg); + // PPC's LR is also not normally described as a callee-saved register in + // calling convention definitions, so we need to watch for it, too. An LR + // mentioned implicitly by a return (or "branch to link register") + // instruction we can ignore, otherwise we may pessimize shrinkwrapping. + UseOrDefCSR = + (!MI.isCall() && PhysReg == SP) || + RCI.getLastCalleeSavedAlias(PhysReg) || + (!MI.isReturn() && TRI->isNonallocatableRegisterCalleeSave(PhysReg)); } else if (MO.isRegMask()) { // Check if this regmask clobbers any of the CSRs. 
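      // (A regmask operand records the registers preserved across the
      //  instruction; any CSR absent from the mask counts as clobbered.)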
for (unsigned Reg : getCurrentCSRs(RS)) { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 6fc6881f8736..ab574232e367 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -715,6 +715,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { SchedPreferenceInfo = Sched::ILP; GatherAllAliasesMaxDepth = 18; IsStrictFPEnabled = DisableStrictNodeMutation; + MaxBytesForAlignment = 0; // TODO: the default will be switched to 0 in the next commit, along // with the Target-specific changes necessary. MaxAtomicSizeInBitsSupported = 1024; @@ -2040,6 +2041,11 @@ Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const { return PrefLoopAlignment; } +unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment( + MachineBasicBlock *MBB) const { + return MaxBytesForAlignment; +} + //===----------------------------------------------------------------------===// // Reciprocal Estimates //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index d1c2cdeb133b..ce350034d073 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -108,8 +108,7 @@ static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags, // ELF //===----------------------------------------------------------------------===// -TargetLoweringObjectFileELF::TargetLoweringObjectFileELF() - : TargetLoweringObjectFile() { +TargetLoweringObjectFileELF::TargetLoweringObjectFileELF() { SupportDSOLocalEquivalentLowering = true; } @@ -478,6 +477,11 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) { return K; } +static bool hasPrefix(StringRef SectionName, StringRef Prefix) { + return SectionName.consume_front(Prefix) && + (SectionName.empty() || SectionName[0] == '.'); +} + static unsigned getELFSectionType(StringRef Name, SectionKind K) { // Use SHT_NOTE for section whose name starts with ".note" to allow // emitting ELF notes from C variable declaration. @@ -485,13 +489,13 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) { if (Name.startswith(".note")) return ELF::SHT_NOTE; - if (Name == ".init_array") + if (hasPrefix(Name, ".init_array")) return ELF::SHT_INIT_ARRAY; - if (Name == ".fini_array") + if (hasPrefix(Name, ".fini_array")) return ELF::SHT_FINI_ARRAY; - if (Name == ".preinit_array") + if (hasPrefix(Name, ".preinit_array")) return ELF::SHT_PREINIT_ARRAY; if (K.isBSS() || K.isThreadBSS()) @@ -1139,8 +1143,7 @@ TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) { // MachO //===----------------------------------------------------------------------===// -TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO() - : TargetLoweringObjectFile() { +TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO() { SupportIndirectSymViaGOTPCRel = true; } @@ -1185,6 +1188,7 @@ void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer, StringRef SectionVal; GetObjCImageInfo(M, VersionVal, ImageInfoFlags, SectionVal); + emitCGProfileMetadata(Streamer, M); // The section is mandatory. If we don't have it, then we don't have GC info. 
if (SectionVal.empty())
     return;
@@ -2543,8 +2547,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
 //===----------------------------------------------------------------------===//
 // GOFF
 //===----------------------------------------------------------------------===//
-TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF()
-    : TargetLoweringObjectFile() {}
+TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF() {}
 MCSection *TargetLoweringObjectFileGOFF::getExplicitSectionGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 402e21d3708b..05004fb935df 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -328,7 +328,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
 // Find the FSProfile file name. The internal option takes precedence
 // over the value from the TargetMachine.
-static const std::string getFSProfileFile(const TargetMachine *TM) {
+static std::string getFSProfileFile(const TargetMachine *TM) {
   if (!FSProfileFile.empty())
     return FSProfileFile.getValue();
   const Optional<PGOOptions> &PGOOpt = TM->getPGOOption();
@@ -339,7 +339,7 @@ static const std::string getFSProfileFile(const TargetMachine *TM) {
 // Find the Profile remapping file name. The internal option takes
 // precedence over the value from the TargetMachine.
-static const std::string getFSRemappingFile(const TargetMachine *TM) {
+static std::string getFSRemappingFile(const TargetMachine *TM) {
   if (!FSRemappingFile.empty())
     return FSRemappingFile.getValue();
   const Optional<PGOOptions> &PGOOpt = TM->getPGOOption();
@@ -1399,6 +1399,9 @@ bool TargetPassConfig::addRegAssignAndRewriteOptimized() {
   // Finally rewrite virtual registers.
   addPass(&VirtRegRewriterID);
+  // Regalloc scoring for ML-driven eviction - noop except when learning a new
+  // eviction policy.
+  addPass(createRegAllocScoringPass());
   return true;
 }
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index f5cb518fce3e..6bcf79547056 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -552,7 +552,7 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
   // Abort if we cannot possibly implement the COPY with the given indexes.
if (BestIdx == 0)
-    return 0;
+    return false;
   NeededIndexes.push_back(BestIdx);
@@ -581,7 +581,7 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
   }
   if (BestIdx == 0)
-    return 0; // Impossible to handle
+    return false; // Impossible to handle
   NeededIndexes.push_back(BestIdx);
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index d042deefd746..01ea171e5ea2 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -116,11 +116,11 @@ class IRPromoter {
   SmallPtrSet<Value*, 8> Promoted;
   void ReplaceAllUsersOfWith(Value *From, Value *To);
-  void ExtendSources(void);
-  void ConvertTruncs(void);
-  void PromoteTree(void);
-  void TruncateSinks(void);
-  void Cleanup(void);
+  void ExtendSources();
+  void ConvertTruncs();
+  void PromoteTree();
+  void TruncateSinks();
+  void Cleanup();
 public:
   IRPromoter(LLVMContext &C, IntegerType *Ty, unsigned Width,
diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
index cbc5d9ec169b..5f59cb4643f2 100644
--- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
+++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
@@ -293,7 +293,7 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
   const std::vector<unsigned> &MaxPressure =
       DAG->getRegPressure().MaxSetPressure;
-  HighPressureSets.assign(MaxPressure.size(), 0);
+  HighPressureSets.assign(MaxPressure.size(), false);
   for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) {
     unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i);
     HighPressureSets[i] =
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index ae0859e1ecfd..b56095ca9a96 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -124,6 +124,7 @@ static bool isTypeTag(uint16_t Tag) {
   case dwarf::DW_TAG_interface_type:
   case dwarf::DW_TAG_unspecified_type:
   case dwarf::DW_TAG_shared_type:
+  case dwarf::DW_TAG_immutable_type:
     return true;
   default:
     break;
@@ -1934,7 +1935,7 @@ uint32_t DWARFLinker::DIECloner::hashFullyQualifiedName(DWARFDie DIE,
   CompileUnit *CU = &U;
   Optional<DWARFFormValue> Ref;
-  while (1) {
+  while (true) {
     if (const char *CurrentName = DIE.getName(DINameKind::ShortName))
       Name = CurrentName;
@@ -2107,7 +2108,6 @@ Error DWARFLinker::loadClangModule(
   // Add this module.
   Unit = std::make_unique<CompileUnit>(*CU, UnitID++, !Options.NoODR,
                                        ModuleName);
-  Unit->setHasInterestingContent();
   analyzeContextInfo(CUDie, 0, *Unit, &ODRContexts.getRoot(), ODRContexts,
                      ModulesEndOffset, Options.ParseableSwiftInterfaces,
                      [&](const Twine &Warning, const DWARFDie &DIE) {
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
index 925ab3d295c2..acecb1788d10 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
@@ -40,8 +40,6 @@ StringRef CompileUnit::getSysRoot() {
 void CompileUnit::markEverythingAsKept() {
   unsigned Idx = 0;
-  setHasInterestingContent();
-
   for (auto &I : Info) {
     // Mark everything that wasn't explicitly marked for pruning.
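    // (Prune was computed earlier by analyzeContextInfo; every other DIE
    //  survives.)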
I.Keep = !I.Prune;
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
index d9b3c4235b4d..5ab2ad0780a2 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
@@ -173,7 +173,7 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE,
       !(*ContextIter)->setLastSeenDIE(U, DIE)) {
     // The context was found, but it is ambiguous with another context
     // in the same file. Mark it invalid.
-    return PointerIntPair<DeclContext *, 1>(*ContextIter, /* Invalid= */ 1);
+    return PointerIntPair<DeclContext *, 1>(*ContextIter, /* IntVal= */ 1);
   }
   assert(ContextIter != Contexts.end());
@@ -183,7 +183,7 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE,
        Context.getTag() != dwarf::DW_TAG_structure_type &&
        Context.getTag() != dwarf::DW_TAG_class_type) ||
       (Tag == dwarf::DW_TAG_union_type))
-    return PointerIntPair<DeclContext *, 1>(*ContextIter, /* Invalid= */ 1);
+    return PointerIntPair<DeclContext *, 1>(*ContextIter, /* IntVal= */ 1);
   return PointerIntPair<DeclContext *, 1>(*ContextIter);
 }
diff --git a/llvm/lib/DebugInfo/CodeView/EnumTables.cpp b/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
index b4a2a0031b2d..adf4ae519dae 100644
--- a/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
+++ b/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -104,7 +104,7 @@ static const EnumEntry<codeview::SourceLanguage> SourceLanguages[] = {
     CV_ENUM_ENT(SourceLanguage, ILAsm),   CV_ENUM_ENT(SourceLanguage, Java),
     CV_ENUM_ENT(SourceLanguage, JScript), CV_ENUM_ENT(SourceLanguage, MSIL),
     CV_ENUM_ENT(SourceLanguage, HLSL),    CV_ENUM_ENT(SourceLanguage, D),
-    CV_ENUM_ENT(SourceLanguage, Swift),
+    CV_ENUM_ENT(SourceLanguage, Swift),   CV_ENUM_ENT(SourceLanguage, Rust),
 };
 static const EnumEntry<uint32_t> CompileSym2FlagNames[] = {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 6e30309ae94a..d68ecd4f8a42 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -15,6 +15,8 @@ using namespace llvm;
 void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
+  if (DumpOpts.SummarizeTypes)
+    return;
   int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(getFormat());
   OS << format("0x%08" PRIx64, getOffset())
      << ": Compile Unit:"
      << " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 95135c95e8d2..ef50ad53650a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -695,14 +695,30 @@ void DWARFContext::dump(
 DWARFTypeUnit *DWARFContext::getTypeUnitForHash(uint16_t Version, uint64_t Hash,
                                                 bool IsDWO) {
-  // FIXME: Check for/use the tu_index here, if there is one.
-  for (const auto &U : IsDWO ? dwo_units() : normal_units()) {
-    if (DWARFTypeUnit *TU = dyn_cast<DWARFTypeUnit>(U.get())) {
-      if (TU->getTypeHash() == Hash)
-        return TU;
+  parseDWOUnits(LazyParse);
+
+  if (const auto &TUI = getTUIndex()) {
+    if (const auto *R = TUI.getFromHash(Hash))
+      return dyn_cast_or_null<DWARFTypeUnit>(
+          DWOUnits.getUnitForIndexEntry(*R));
+    return nullptr;
+  }
+
+  struct UnitContainers {
+    const DWARFUnitVector &Units;
+    Optional<DenseMap<uint64_t, DWARFTypeUnit *>> &Map;
+  };
+  UnitContainers Units = IsDWO ? 
UnitContainers{DWOUnits, DWOTypeUnits} + : UnitContainers{NormalUnits, NormalTypeUnits}; + if (!Units.Map) { + Units.Map.emplace(); + for (const auto &U : IsDWO ? dwo_units() : normal_units()) { + if (DWARFTypeUnit *TU = dyn_cast<DWARFTypeUnit>(U.get())) + (*Units.Map)[TU->getTypeHash()] = TU; } } - return nullptr; + + return (*Units.Map)[Hash]; } DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) { @@ -1098,6 +1114,7 @@ static Optional<uint64_t> getTypeSize(DWARFDie Type, uint64_t PointerSize) { return PointerSize; } case DW_TAG_const_type: + case DW_TAG_immutable_type: case DW_TAG_volatile_type: case DW_TAG_restrict_type: case DW_TAG_typedef: { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp index d91a630256d6..ee54fc754803 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp @@ -74,7 +74,7 @@ std::string DWARFAbbreviationDeclarationSet::getCodeRange() const { for (const auto &Decl : Decls) Codes.push_back(Decl.getCode()); - std::string Buffer = ""; + std::string Buffer; raw_string_ostream Stream(Buffer); // Each iteration through this loop represents a single contiguous range in // the set of codes. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 5421b2d59a1b..ec7889a3728a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -215,15 +215,16 @@ struct DWARFTypePrinter { OS << "void"; return DWARFDie(); } - DWARFDie Inner = resolveReferencedType(D); + DWARFDie InnerDIE; + auto Inner = [&] { return InnerDIE = resolveReferencedType(D); }; const dwarf::Tag T = D.getTag(); switch (T) { case DW_TAG_pointer_type: { - appendPointerLikeTypeBefore(D, Inner, "*"); + appendPointerLikeTypeBefore(D, Inner(), "*"); break; } case DW_TAG_subroutine_type: { - appendQualifiedNameBefore(Inner); + appendQualifiedNameBefore(Inner()); if (Word) { OS << ' '; } @@ -231,18 +232,18 @@ struct DWARFTypePrinter { break; } case DW_TAG_array_type: { - appendQualifiedNameBefore(Inner); + appendQualifiedNameBefore(Inner()); break; } case DW_TAG_reference_type: - appendPointerLikeTypeBefore(D, Inner, "&"); + appendPointerLikeTypeBefore(D, Inner(), "&"); break; case DW_TAG_rvalue_reference_type: - appendPointerLikeTypeBefore(D, Inner, "&&"); + appendPointerLikeTypeBefore(D, Inner(), "&&"); break; case DW_TAG_ptr_to_member_type: { - appendQualifiedNameBefore(Inner); - if (needsParens(Inner)) + appendQualifiedNameBefore(Inner()); + if (needsParens(InnerDIE)) OS << '('; else if (Word) OS << ' '; @@ -284,7 +285,7 @@ struct DWARFTypePrinter { const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr); if (!NamePtr) { appendTypeTagName(D.getTag()); - return Inner; + return DWARFDie(); } Word = true; StringRef Name = NamePtr; @@ -317,7 +318,7 @@ struct DWARFTypePrinter { break; } } - return Inner; + return InnerDIE; } void appendUnqualifiedNameAfter(DWARFDie D, DWARFDie Inner, @@ -610,7 +611,8 @@ struct DWARFTypePrinter { bool First = true; bool RealFirst = true; for (DWARFDie P : D) { - if (P.getTag() != DW_TAG_formal_parameter) + if (P.getTag() != DW_TAG_formal_parameter && + P.getTag() != DW_TAG_unspecified_parameters) return; DWARFDie T = resolveReferencedType(P); if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) { @@ -622,7 +624,10 @@ struct DWARFTypePrinter { OS << ", "; } First = false; - appendQualifiedName(T); + if (P.getTag() == DW_TAG_unspecified_parameters) + 
OS << "..."; + else + appendQualifiedName(T); } EndedWithTemplate = false; OS << ')'; @@ -767,7 +772,7 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, DWARFDie D = resolveReferencedType(Die, FormValue); if (D && !D.isNULL()) { OS << Space << "\""; - DWARFTypePrinter(OS).appendQualifiedName(D); + dumpTypeQualifiedName(D, OS); OS << '"'; } } else if (Attr == DW_AT_APPLE_property_attribute) { @@ -801,7 +806,9 @@ void DWARFDie::getFullName(raw_string_ostream &OS, const char *NamePtr = getShortName(); if (!NamePtr) return; - DWARFTypePrinter(OS).appendUnqualifiedName(*this, OriginalFullName); + if (getTag() == DW_TAG_GNU_template_parameter_pack) + return; + dumpTypeUnqualifiedName(*this, OS, OriginalFullName); } bool DWARFDie::isSubprogramDIE() const { return getTag() == DW_TAG_subprogram; } @@ -1263,3 +1270,16 @@ bool DWARFAttribute::mayHaveLocationExpr(dwarf::Attribute Attr) { return false; } } + +namespace llvm { + +void dumpTypeQualifiedName(const DWARFDie &DIE, raw_ostream &OS) { + DWARFTypePrinter(OS).appendQualifiedName(DIE); +} + +void dumpTypeUnqualifiedName(const DWARFDie &DIE, raw_ostream &OS, + std::string *OriginalFullName) { + DWARFTypePrinter(OS).appendUnqualifiedName(DIE, OriginalFullName); +} + +} // namespace llvm diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 6424c2f59844..ca7ac785b550 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -173,7 +173,7 @@ bool DWARFVerifier::verifyName(const DWARFDie &Die) { Die.getFullName(OS, &OriginalFullName); OS.flush(); if (OriginalFullName.empty() || OriginalFullName == ReconstructedName) - return 0; + return false; error() << "Simplified template DW_AT_name could not be reconstituted:\n" << formatv(" original: {0}\n" @@ -181,7 +181,7 @@ bool DWARFVerifier::verifyName(const DWARFDie &Die) { OriginalFullName, ReconstructedName); dump(Die) << '\n'; dump(Die.getDwarfUnit()->getUnitDIE()) << '\n'; - return 1; + return true; } unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit, @@ -322,12 +322,19 @@ unsigned DWARFVerifier::verifyUnits(const DWARFUnitVector &Units) { unsigned NumDebugInfoErrors = 0; ReferenceMap CrossUnitReferences; + unsigned Index = 1; for (const auto &Unit : Units) { - ReferenceMap UnitLocalReferences; - NumDebugInfoErrors += - verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); - NumDebugInfoErrors += verifyDebugInfoReferences( - UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); }); + OS << "Verifying unit: " << Index << " / " << Units.getNumUnits(); + if (const char* Name = Unit->getUnitDIE(true).getShortName()) + OS << ", \"" << Name << '\"'; + OS << '\n'; + OS.flush(); + ReferenceMap UnitLocalReferences; + NumDebugInfoErrors += + verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); + NumDebugInfoErrors += verifyDebugInfoReferences( + UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); }); + ++Index; } NumDebugInfoErrors += verifyDebugInfoReferences( diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp index ac217df1ee48..2524e10cb6c5 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp @@ -23,7 +23,7 @@ using namespace llvm::pdb; NativeEnumTypes::NativeEnumTypes(NativeSession &PDBSession, LazyRandomTypeCollection &Types, std::vector<codeview::TypeLeafKind> Kinds) - : Matches(), Index(0), 
Session(PDBSession) { + : Index(0), Session(PDBSession) { Optional<TypeIndex> TI = Types.getFirst(); while (TI) { CVType CVT = Types.getType(*TI); diff --git a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp index 25962e5152eb..a6d7ca0da7a9 100644 --- a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp @@ -231,6 +231,7 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_Lang &Lang) { CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, HLSL, OS) CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, D, OS) CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Swift, OS) + CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Rust, OS) } return OS; } diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp index 9b2883546305..529100b23ba5 100644 --- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp +++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp @@ -100,6 +100,7 @@ std::string PDBSymbolCompiland::getSourceFileFullPath() const { .Case(".c", Lang == PDB_Lang::C) .Case(".asm", Lang == PDB_Lang::Masm) .Case(".swift", Lang == PDB_Lang::Swift) + .Case(".rs", Lang == PDB_Lang::Rust) .Default(false)) return File->getFileName(); } diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp index 555d29fe184b..e29968d113bd 100644 --- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -33,8 +33,8 @@ namespace symbolize { class SourceCode { std::unique_ptr<MemoryBuffer> MemBuf; - const Optional<StringRef> load(StringRef FileName, - const Optional<StringRef> &EmbeddedSource) { + Optional<StringRef> load(StringRef FileName, + const Optional<StringRef> &EmbeddedSource) { if (Lines <= 0) return None; @@ -50,7 +50,7 @@ class SourceCode { } } - const Optional<StringRef> pruneSource(const Optional<StringRef> &Source) { + Optional<StringRef> pruneSource(const Optional<StringRef> &Source) { if (!Source) return None; size_t FirstLinePos = StringRef::npos, Pos = 0; diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp index 389b18fd62ac..27614572766d 100644 --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -21,8 +21,10 @@ #include "llvm/Debuginfod/HTTPClient.h" #include "llvm/Support/CachePruning.h" #include "llvm/Support/Caching.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileUtilities.h" +#include "llvm/Support/Path.h" #include "llvm/Support/xxhash.h" namespace llvm { @@ -36,7 +38,7 @@ static std::string buildIDToString(BuildIDRef ID) { Expected<SmallVector<StringRef>> getDefaultDebuginfodUrls() { const char *DebuginfodUrlsEnv = std::getenv("DEBUGINFOD_URLS"); - if (DebuginfodUrlsEnv == NULL) + if (DebuginfodUrlsEnv == nullptr) return SmallVector<StringRef>(); SmallVector<StringRef> DebuginfodUrls; @@ -52,6 +54,7 @@ Expected<std::string> getDefaultDebuginfodCacheDirectory() { if (!sys::path::cache_directory(CacheDirectory)) return createStringError( errc::io_error, "Unable to determine appropriate cache directory."); + sys::path::append(CacheDirectory, "llvm-debuginfod", "client"); return std::string(CacheDirectory); } diff --git a/llvm/lib/Demangle/DLangDemangle.cpp b/llvm/lib/Demangle/DLangDemangle.cpp index 0cefbd63a7ae..7cecd8007087 100644 --- a/llvm/lib/Demangle/DLangDemangle.cpp +++ b/llvm/lib/Demangle/DLangDemangle.cpp @@ -68,7 +68,53 @@ private: /// \note A result larger than UINT_MAX is considered a failure. 
/// /// \see https://dlang.org/spec/abi.html#Number . - const char *decodeNumber(const char *Mangled, unsigned long *Ret); + const char *decodeNumber(const char *Mangled, unsigned long &Ret); + + /// Extract the back reference position from a given string. + /// + /// \param Mangled string to extract the back reference position. + /// \param Ret assigned result value. + /// + /// \return the remaining string on success or nullptr on failure. + /// + /// \note Ret is always >= 0 on success, and unspecified on failure. + /// + /// \see https://dlang.org/spec/abi.html#back_ref . + /// \see https://dlang.org/spec/abi.html#NumberBackRef . + const char *decodeBackrefPos(const char *Mangled, long &Ret); + + /// Extract the symbol pointed to by the back reference from a given string. + /// + /// \param Mangled string to extract the back reference position. + /// \param Ret assigned result value. + /// + /// \return the remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#back_ref . + const char *decodeBackref(const char *Mangled, const char *&Ret); + + /// Extract and demangle backreferenced symbol from a given mangled symbol + /// and append it to the output string. + /// + /// \param Demangled output buffer to write the demangled name. + /// \param Mangled mangled symbol to be demangled. + /// + /// \return the remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#back_ref . + /// \see https://dlang.org/spec/abi.html#IdentifierBackRef . + const char *parseSymbolBackref(OutputBuffer *Demangled, const char *Mangled); + + /// Extract and demangle backreferenced type from a given mangled symbol + /// and append it to the output string. + /// + /// \param Mangled mangled symbol to be demangled. + /// + /// \return the remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#back_ref . + /// \see https://dlang.org/spec/abi.html#TypeBackRef . + const char *parseTypeBackref(const char *Mangled); /// Check whether it is the beginning of a symbol name. /// @@ -115,13 +161,25 @@ private: /// \see https://dlang.org/spec/abi.html#QualifiedName . const char *parseQualified(OutputBuffer *Demangled, const char *Mangled); + /// Extract and demangle a type from a given mangled symbol and append it to + /// the output string. + /// + /// \param Mangled mangled symbol to be demangled. + /// + /// \return the remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#Type . + const char *parseType(const char *Mangled); + /// The string we are demangling. const char *Str; + /// The index of the last back reference. + int LastBackref; }; } // namespace -const char *Demangler::decodeNumber(const char *Mangled, unsigned long *Ret) { +const char *Demangler::decodeNumber(const char *Mangled, unsigned long &Ret) { // Return nullptr if trying to extract something that isn't a digit.
if (Mangled == nullptr || !std::isdigit(*Mangled)) return nullptr; @@ -142,16 +200,145 @@ const char *Demangler::decodeNumber(const char *Mangled, unsigned long *Ret) { if (*Mangled == '\0') return nullptr; - *Ret = Val; + Ret = Val; + return Mangled; +} + +const char *Demangler::decodeBackrefPos(const char *Mangled, long &Ret) { + // Return nullptr if trying to extract something that isn't a digit + if (Mangled == nullptr || !std::isalpha(*Mangled)) + return nullptr; + + // Any identifier or non-basic type that has been emitted to the mangled + // symbol before will not be emitted again, but is referenced by a special + // sequence encoding the relative position of the original occurrence in the + // mangled symbol name. + // Numbers in back references are encoded with base 26 by upper case letters + // A-Z for higher digits but lower case letters a-z for the last digit. + // NumberBackRef: + // [a-z] + // [A-Z] NumberBackRef + // ^ + unsigned long Val = 0; + + while (std::isalpha(*Mangled)) { + // Check for overflow + if (Val > (std::numeric_limits<unsigned long>::max() - 25) / 26) + break; + + Val *= 26; + + if (Mangled[0] >= 'a' && Mangled[0] <= 'z') { + Val += Mangled[0] - 'a'; + if ((long)Val <= 0) + break; + Ret = Val; + return Mangled + 1; + } + + Val += Mangled[0] - 'A'; + ++Mangled; + } + + return nullptr; +} + +const char *Demangler::decodeBackref(const char *Mangled, const char *&Ret) { + assert(Mangled != nullptr && *Mangled == 'Q' && "Invalid back reference!"); + Ret = nullptr; + + // Position of 'Q' + const char *Qpos = Mangled; + long RefPos; + ++Mangled; + + Mangled = decodeBackrefPos(Mangled, RefPos); + if (Mangled == nullptr) + return nullptr; + + if (RefPos > Qpos - Str) + return nullptr; + + // Set the position of the back reference. + Ret = Qpos - RefPos; + + return Mangled; +} + +const char *Demangler::parseSymbolBackref(OutputBuffer *Demangled, + const char *Mangled) { + // An identifier back reference always points to a digit 0 to 9. + // IdentifierBackRef: + // Q NumberBackRef + // ^ + const char *Backref; + unsigned long Len; + + // Get position of the back reference + Mangled = decodeBackref(Mangled, Backref); + + // Must point to a simple identifier + Backref = decodeNumber(Backref, Len); + if (Backref == nullptr || strlen(Backref) < Len) + return nullptr; + + Backref = parseLName(Demangled, Backref, Len); + if (Backref == nullptr) + return nullptr; + + return Mangled; +} + +const char *Demangler::parseTypeBackref(const char *Mangled) { + // A type back reference always points to a letter. + // TypeBackRef: + // Q NumberBackRef + // ^ + const char *Backref; + + // If we appear to be moving backwards through the mangle string, then + // bail as this may be a recursive back reference. + if (Mangled - Str >= LastBackref) + return nullptr; + + int SaveRefPos = LastBackref; + LastBackref = Mangled - Str; + + // Get position of the back reference. + Mangled = decodeBackref(Mangled, Backref); + + // Can't decode back reference. + if (Backref == nullptr) + return nullptr; + + // TODO: Add support for function type back references. + Backref = parseType(Backref); + + LastBackref = SaveRefPos; + + if (Backref == nullptr) + return nullptr; + return Mangled; } bool Demangler::isSymbolName(const char *Mangled) { + long Ret; + const char *Qref = Mangled; + if (std::isdigit(*Mangled)) return true; - // TODO: Handle symbol back references and template instances. - return false; + // TODO: Handle template instances. 
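(For illustration: the NumberBackRef scheme decoded above encodes a back reference position in base 26, with uppercase letters carrying the leading digits and a single lowercase letter terminating the number. The standalone sketch below, using the hypothetical helper name decodeNumberBackrefSketch, mirrors that loop; it is an editorial aid, not part of the LLVM sources.)

#include <cctype>

// Decode a D NumberBackRef: uppercase A-Z carry the higher base-26 digits,
// one lowercase a-z digit ends the number. Returns -1 on malformed input.
static long decodeNumberBackrefSketch(const char *S) {
  unsigned long Val = 0;
  while (S != nullptr && std::isalpha((unsigned char)*S)) {
    Val *= 26;
    if (*S >= 'a' && *S <= 'z')
      return (long)(Val + (unsigned long)(*S - 'a')); // final digit
    Val += (unsigned long)(*S - 'A'); // higher digit, keep scanning
    ++S;
  }
  return -1;
}

// Examples: "b" decodes to 1; "BAb" decodes to (1*26 + 0)*26 + 1 == 677.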
+ + if (*Mangled != 'Q') + return false; + + Mangled = decodeBackrefPos(Mangled + 1, Ret); + if (Mangled == nullptr || Ret > Qref - Str) + return false; + + return std::isdigit(Qref[-Ret]); } const char *Demangler::parseMangle(OutputBuffer *Demangled, @@ -174,8 +361,7 @@ const char *Demangler::parseMangle(OutputBuffer *Demangled, if (*Mangled == 'Z') ++Mangled; else { - // TODO: Implement symbols with types. - return nullptr; + Mangled = parseType(Mangled); } } @@ -228,9 +414,12 @@ const char *Demangler::parseIdentifier(OutputBuffer *Demangled, if (Mangled == nullptr || *Mangled == '\0') return nullptr; - // TODO: Parse back references and lengthless template instances. + if (*Mangled == 'Q') + return parseSymbolBackref(Demangled, Mangled); + + // TODO: Parse lengthless template instances. - const char *Endptr = decodeNumber(Mangled, &Len); + const char *Endptr = decodeNumber(Mangled, Len); if (Endptr == nullptr || Len == 0) return nullptr; @@ -262,6 +451,34 @@ const char *Demangler::parseIdentifier(OutputBuffer *Demangled, return parseLName(Demangled, Mangled, Len); } +const char *Demangler::parseType(const char *Mangled) { + if (*Mangled == '\0') + return nullptr; + + switch (*Mangled) { + // TODO: Parse type qualifiers. + // TODO: Parse function types. + // TODO: Parse compound types. + // TODO: Parse delegate types. + // TODO: Parse tuple types. + + // Basic types. + case 'i': + ++Mangled; + // TODO: Add type name dumping + return Mangled; + + // TODO: Add support for the rest of the basic types. + + // Back referenced type. + case 'Q': + return parseTypeBackref(Mangled); + + default: // unhandled. + return nullptr; + } +} + const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled, unsigned long Len) { switch (Len) { @@ -319,7 +536,8 @@ const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled, return Mangled; } -Demangler::Demangler(const char *Mangled) : Str(Mangled) {} +Demangler::Demangler(const char *Mangled) + : Str(Mangled), LastBackref(strlen(Mangled)) {} const char *Demangler::parseMangle(OutputBuffer *Demangled) { return parseMangle(Demangled, this->Str); diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index 3f68f76761ce..1a5db755e37b 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -19,9 +19,7 @@ #include <cstdlib> #include <cstring> #include <functional> -#include <numeric> #include <utility> -#include <vector> using namespace llvm; using namespace llvm::itanium_demangle; diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp index 32d8dff66c3f..d07d05a08c55 100644 --- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Demangle/MicrosoftDemangleNodes.h" -#include "llvm/Demangle/DemangleConfig.h" #include "llvm/Demangle/Utility.h" #include <cctype> #include <string> diff --git a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp index e15bce0d6c4b..1fb37ce7c57c 100644 --- a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp +++ b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp @@ -96,7 +96,7 @@ class GDBJITRegistrationListener : public JITEventListener { public: /// Instantiates the JIT service. 
- GDBJITRegistrationListener() : ObjectBufferMap() {} + GDBJITRegistrationListener() {} /// Unregisters each object that was previously registered and releases all /// internal resources. diff --git a/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h b/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h index 8ae3bc2bf61d..159880e4b152 100644 --- a/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h +++ b/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h @@ -52,13 +52,13 @@ public: auto &SR = getSectionRange(*D.Sec); if (D.IsStart) { if (SR.empty()) - G.makeAbsolute(*Sym, 0); + G.makeAbsolute(*Sym, orc::ExecutorAddr()); else G.makeDefined(*Sym, *SR.getFirstBlock(), 0, 0, Linkage::Strong, Scope::Local, false); } else { if (SR.empty()) - G.makeAbsolute(*Sym, 0); + G.makeAbsolute(*Sym, orc::ExecutorAddr()); else G.makeDefined(*Sym, *SR.getLastBlock(), SR.getLastBlock()->getSize(), 0, Linkage::Strong, diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 4d7d5ce26668..2ae193595fc0 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -65,10 +65,7 @@ Error EHFrameSplitter::operator()(LinkGraph &G) { Error EHFrameSplitter::processBlock(LinkGraph &G, Block &B, LinkGraph::SplitBlockCache &Cache) { - LLVM_DEBUG({ - dbgs() << " Processing block at " << formatv("{0:x16}", B.getAddress()) - << "\n"; - }); + LLVM_DEBUG(dbgs() << " Processing block at " << B.getAddress() << "\n"); // eh-frame should not contain zero-fill blocks. if (B.isZeroFill()) @@ -400,7 +397,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, BlockEdgeMap &BlockEdges) { LLVM_DEBUG(dbgs() << " Record is FDE\n"); - JITTargetAddress RecordAddress = B.getAddress() + RecordOffset; + orc::ExecutorAddr RecordAddress = B.getAddress() + RecordOffset; auto RecordContent = B.getContent().slice(RecordOffset, RecordLength); BinaryStreamReader RecordReader( @@ -418,8 +415,9 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, { // Process the CIE pointer field. auto CIEEdgeItr = BlockEdges.find(RecordOffset + CIEDeltaFieldOffset); - JITTargetAddress CIEAddress = - RecordAddress + CIEDeltaFieldOffset - CIEDelta; + orc::ExecutorAddr CIEAddress = + RecordAddress + orc::ExecutorAddrDiff(CIEDeltaFieldOffset) - + orc::ExecutorAddrDiff(CIEDelta); if (CIEEdgeItr == BlockEdges.end()) { LLVM_DEBUG({ @@ -456,7 +454,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, { // Process the PC-Begin field. 
Block *PCBeginBlock = nullptr; - JITTargetAddress PCBeginFieldOffset = RecordReader.getOffset(); + orc::ExecutorAddrDiff PCBeginFieldOffset = RecordReader.getOffset(); auto PCEdgeItr = BlockEdges.find(RecordOffset + PCBeginFieldOffset); if (PCEdgeItr == BlockEdges.end()) { auto PCBeginPtrInfo = @@ -464,12 +462,12 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, RecordAddress + PCBeginFieldOffset, RecordReader); if (!PCBeginPtrInfo) return PCBeginPtrInfo.takeError(); - JITTargetAddress PCBegin = PCBeginPtrInfo->first; + orc::ExecutorAddr PCBegin = PCBeginPtrInfo->first; Edge::Kind PCBeginEdgeKind = PCBeginPtrInfo->second; LLVM_DEBUG({ dbgs() << " Adding edge at " - << formatv("{0:x16}", RecordAddress + PCBeginFieldOffset) - << " to PC at " << formatv("{0:x16}", PCBegin) << "\n"; + << (RecordAddress + PCBeginFieldOffset) << " to PC at " + << formatv("{0:x16}", PCBegin) << "\n"; }); auto PCBeginSym = getOrCreateSymbol(PC, PCBegin); if (!PCBeginSym) @@ -522,7 +520,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, if (auto Err = RecordReader.readULEB128(AugmentationDataSize)) return Err; - JITTargetAddress LSDAFieldOffset = RecordReader.getOffset(); + orc::ExecutorAddrDiff LSDAFieldOffset = RecordReader.getOffset(); auto LSDAEdgeItr = BlockEdges.find(RecordOffset + LSDAFieldOffset); if (LSDAEdgeItr == BlockEdges.end()) { auto LSDAPointerInfo = @@ -530,7 +528,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, RecordAddress + LSDAFieldOffset, RecordReader); if (!LSDAPointerInfo) return LSDAPointerInfo.takeError(); - JITTargetAddress LSDA = LSDAPointerInfo->first; + orc::ExecutorAddr LSDA = LSDAPointerInfo->first; Edge::Kind LSDAEdgeKind = LSDAPointerInfo->second; auto LSDASym = getOrCreateSymbol(PC, LSDA); if (!LSDASym) @@ -645,12 +643,10 @@ unsigned EHFrameEdgeFixer::getPointerEncodingDataSize(uint8_t PointerEncoding) { } } -Expected<std::pair<JITTargetAddress, Edge::Kind>> +Expected<std::pair<orc::ExecutorAddr, Edge::Kind>> EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding, - JITTargetAddress PointerFieldAddress, + orc::ExecutorAddr PointerFieldAddress, BinaryStreamReader &RecordReader) { - static_assert(sizeof(JITTargetAddress) == sizeof(uint64_t), - "Result must be able to hold a uint64_t"); assert(isSupportedPointerEncoding(PointerEncoding) && "Unsupported pointer encoding"); @@ -663,7 +659,7 @@ EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding, if (EffectiveType == DW_EH_PE_absptr) EffectiveType = (PointerSize == 8) ? 
DW_EH_PE_udata8 : DW_EH_PE_udata4; - JITTargetAddress Addr; + orc::ExecutorAddr Addr; Edge::Kind PointerEdgeKind = Edge::Invalid; switch (EffectiveType) { case DW_EH_PE_udata4: { @@ -709,7 +705,7 @@ EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding, } Expected<Symbol &> EHFrameEdgeFixer::getOrCreateSymbol(ParseContext &PC, - JITTargetAddress Addr) { + orc::ExecutorAddr Addr) { Symbol *CanonicalSym = nullptr; auto UpdateCanonicalSym = [&](Symbol *Sym) { @@ -753,8 +749,9 @@ Error EHFrameNullTerminator::operator()(LinkGraph &G) { << EHFrameSectionName << "\n"; }); - auto &NullTerminatorBlock = G.createContentBlock( - *EHFrame, NullTerminatorBlockContent, 0xfffffffffffffffc, 1, 0); + auto &NullTerminatorBlock = + G.createContentBlock(*EHFrame, NullTerminatorBlockContent, + orc::ExecutorAddr(~uint64_t(4)), 1, 0); G.addAnonymousSymbol(NullTerminatorBlock, 0, 4, false, true); return Error::success(); } @@ -762,17 +759,15 @@ Error EHFrameNullTerminator::operator()(LinkGraph &G) { EHFrameRegistrar::~EHFrameRegistrar() {} Error InProcessEHFrameRegistrar::registerEHFrames( - JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) { - return orc::registerEHFrameSection( - jitTargetAddressToPointer<void *>(EHFrameSectionAddr), - EHFrameSectionSize); + orc::ExecutorAddrRange EHFrameSection) { + return orc::registerEHFrameSection(EHFrameSection.Start.toPtr<void *>(), + EHFrameSection.size()); } Error InProcessEHFrameRegistrar::deregisterEHFrames( - JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) { - return orc::deregisterEHFrameSection( - jitTargetAddressToPointer<void *>(EHFrameSectionAddr), - EHFrameSectionSize); + orc::ExecutorAddrRange EHFrameSection) { + return orc::deregisterEHFrameSection(EHFrameSection.Start.toPtr<void *>(), + EHFrameSection.size()); } LinkGraphPassFunction @@ -789,14 +784,14 @@ createEHFrameRecorderPass(const Triple &TT, StoreFrameRange = std::move(StoreRangeAddress)](LinkGraph &G) -> Error { // Search for a non-empty eh-frame and record the address of the first // symbol in it. 
- JITTargetAddress Addr = 0; + orc::ExecutorAddr Addr; size_t Size = 0; if (auto *S = G.findSectionByName(EHFrameSectionName)) { auto R = SectionRange(*S); Addr = R.getStart(); Size = R.getSize(); } - if (Addr == 0 && Size != 0) + if (!Addr && Size != 0) return make_error<JITLinkError>( StringRef(EHFrameSectionName) + " section can not have zero address with non-zero size"); diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h index b4c4b0f7b097..ef4b47b9aa28 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h @@ -71,12 +71,12 @@ private: }; using BlockEdgeMap = DenseMap<Edge::OffsetT, EdgeTarget>; - using CIEInfosMap = DenseMap<JITTargetAddress, CIEInformation>; + using CIEInfosMap = DenseMap<orc::ExecutorAddr, CIEInformation>; struct ParseContext { ParseContext(LinkGraph &G) : G(G) {} - Expected<CIEInformation *> findCIEInfo(JITTargetAddress Address) { + Expected<CIEInformation *> findCIEInfo(orc::ExecutorAddr Address) { auto I = CIEInfos.find(Address); if (I == CIEInfos.end()) return make_error<JITLinkError>("No CIE found at address " + @@ -102,12 +102,13 @@ private: static bool isSupportedPointerEncoding(uint8_t PointerEncoding); unsigned getPointerEncodingDataSize(uint8_t PointerEncoding); - Expected<std::pair<JITTargetAddress, Edge::Kind>> + Expected<std::pair<orc::ExecutorAddr, Edge::Kind>> readEncodedPointer(uint8_t PointerEncoding, - JITTargetAddress PointerFieldAddress, + orc::ExecutorAddr PointerFieldAddress, BinaryStreamReader &RecordReader); - Expected<Symbol &> getOrCreateSymbol(ParseContext &PC, JITTargetAddress Addr); + Expected<Symbol &> getOrCreateSymbol(ParseContext &PC, + orc::ExecutorAddr Addr); StringRef EHFrameSectionName; unsigned PointerSize; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index f9101d71dfa8..2ab7ed61f71b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -77,14 +77,14 @@ protected: return Obj.getHeader().e_type == llvm::ELF::ET_REL; } - void setGraphSection(ELFSectionIndex SecIndex, Section &Sec) { - assert(!GraphSections.count(SecIndex) && "Duplicate section at index"); - GraphSections[SecIndex] = &Sec; + void setGraphBlock(ELFSectionIndex SecIndex, Block *B) { + assert(!GraphBlocks.count(SecIndex) && "Duplicate section at index"); + GraphBlocks[SecIndex] = B; } - Section *getGraphSection(ELFSectionIndex SecIndex) { - auto I = GraphSections.find(SecIndex); - if (I == GraphSections.end()) + Block *getGraphBlock(ELFSectionIndex SecIndex) { + auto I = GraphBlocks.find(SecIndex); + if (I == GraphBlocks.end()) return nullptr; return I->second; } @@ -139,9 +139,9 @@ protected: const typename ELFFile::Elf_Shdr *SymTabSec = nullptr; StringRef SectionStringTab; - // Maps ELF section indexes to LinkGraph Sections. - // Only SHF_ALLOC sections will have graph sections. - DenseMap<ELFSectionIndex, Section *> GraphSections; + // Maps ELF section indexes to LinkGraph Blocks. + // Only SHF_ALLOC sections will have graph blocks. 
+ DenseMap<ELFSectionIndex, Block *> GraphBlocks; DenseMap<ELFSymbolIndex, Symbol *> GraphSymbols; DenseMap<const typename ELFFile::Elf_Shdr *, ArrayRef<typename ELFFile::Elf_Word>> @@ -316,18 +316,27 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() { else Prot = MemProt::Read | MemProt::Write; - auto &GraphSec = G->createSection(*Name, Prot); + // Look for existing sections first. + auto *GraphSec = G->findSectionByName(*Name); + if (!GraphSec) + GraphSec = &G->createSection(*Name, Prot); + assert(GraphSec->getMemProt() == Prot && "MemProt should match"); + + Block *B = nullptr; if (Sec.sh_type != ELF::SHT_NOBITS) { auto Data = Obj.template getSectionContentsAsArray<char>(Sec); if (!Data) return Data.takeError(); - G->createContentBlock(GraphSec, *Data, Sec.sh_addr, Sec.sh_addralign, 0); + B = &G->createContentBlock(*GraphSec, *Data, + orc::ExecutorAddr(Sec.sh_addr), + Sec.sh_addralign, 0); } else - G->createZeroFillBlock(GraphSec, Sec.sh_size, Sec.sh_addr, - Sec.sh_addralign, 0); + B = &G->createZeroFillBlock(*GraphSec, Sec.sh_size, + orc::ExecutorAddr(Sec.sh_addr), + Sec.sh_addralign, 0); - setGraphSection(SecIndex, GraphSec); + setGraphBlock(SecIndex, B); } return Error::success(); @@ -393,9 +402,9 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() { // Handle common symbols specially. if (Sym.isCommon()) { - Symbol &GSym = - G->addCommonSymbol(*Name, Scope::Default, getCommonSection(), 0, - Sym.st_size, Sym.getValue(), false); + Symbol &GSym = G->addCommonSymbol(*Name, Scope::Default, + getCommonSection(), orc::ExecutorAddr(), + Sym.st_size, Sym.getValue(), false); setGraphSymbol(SymIndex, GSym); continue; } @@ -425,28 +434,24 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() { return NdxOrErr.takeError(); Shndx = *NdxOrErr; } - if (auto *GraphSec = getGraphSection(Shndx)) { - Block *B = nullptr; - { - auto Blocks = GraphSec->blocks(); - assert(Blocks.begin() != Blocks.end() && "No blocks for section"); - assert(std::next(Blocks.begin()) == Blocks.end() && - "Multiple blocks for section"); - B = *Blocks.begin(); - } - + if (auto *B = getGraphBlock(Shndx)) { LLVM_DEBUG({ dbgs() << " " << SymIndex << ": Creating defined graph symbol for ELF symbol \"" << *Name << "\"\n"; }); - if (Sym.getType() == ELF::STT_SECTION) - *Name = GraphSec->getName(); - + // In RISC-V, temporary symbols (used to generate the dwarf and eh_frame + // sections, among others) appear in the object file's symbol table, and + // LLVM does not assign names to these temporary symbols, while the + // RISC-V GNU toolchain does. If the symbol is unnamed, add an anonymous + // symbol for it instead. auto &GSym = - G->addDefinedSymbol(*B, Sym.getValue(), *Name, Sym.st_size, L, S, - Sym.getType() == ELF::STT_FUNC, false); + Name->empty() + ? G->addAnonymousSymbol(*B, Sym.getValue(), Sym.st_size, + false, false) + : G->addDefinedSymbol(*B, Sym.getValue(), *Name, Sym.st_size, L, + S, Sym.getType() == ELF::STT_FUNC, false); setGraphSymbol(SymIndex, GSym); } } else if (Sym.isUndefined() && Sym.isExternal()) { @@ -498,8 +503,8 @@ Error ELFLinkGraphBuilder<ELFT>::forEachRelocation( } // Look up the link-graph block for the section these relocations apply to.
- Section *GraphSect = G->findSectionByName(*Name); - if (!GraphSect) + auto *BlockToFix = getGraphBlock(RelSect.sh_info); + if (!BlockToFix) return make_error<StringError>( "Referencing a section that wasn't added to the graph: " + *Name, inconvertibleErrorCode()); @@ -510,7 +515,7 @@ Error ELFLinkGraphBuilder<ELFT>::forEachRelocation( // Let the callee process relocation entries one by one. for (const typename ELFT::Rela &R : *RelEntries) - if (Error Err = Func(R, **FixupSection, *GraphSect)) + if (Error Err = Func(R, **FixupSection, *BlockToFix)) return Err; LLVM_DEBUG(dbgs() << "\n"); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp index dc183dfddfae..dd3eb97c21a0 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp @@ -16,6 +16,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "jitlink" @@ -41,16 +42,17 @@ private: char *BlockWorkingMem = B.getAlreadyMutableContent().data(); char *FixupPtr = BlockWorkingMem + E.getOffset(); - JITTargetAddress FixupAddress = B.getAddress() + E.getOffset(); + auto FixupAddress = B.getAddress() + E.getOffset(); switch (E.getKind()) { case aarch64::R_AARCH64_CALL26: { - assert((FixupAddress & 0x3) == 0 && "Call-inst is not 32-bit aligned"); + assert((FixupAddress.getValue() & 0x3) == 0 && + "Call-inst is not 32-bit aligned"); int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); if (static_cast<uint64_t>(Value) & 0x3) return make_error<JITLinkError>("Call target is not 32-bit aligned"); - if (!fitsRangeSignedInt<27>(Value)) + if (!isInt<28>(Value)) return makeTargetOutOfRangeError(G, B, E); uint32_t RawInstr = *(little32_t *)FixupPtr; @@ -64,10 +66,6 @@ private: } return Error::success(); } - - template <uint8_t Bits> static bool fitsRangeSignedInt(int64_t Value) { - return Value >= -(1ll << Bits) && Value < (1ll << Bits); - } }; template <typename ELFT> @@ -100,7 +98,7 @@ private: Error addSingleRelocation(const typename ELFT::Rela &Rel, const typename ELFT::Shdr &FixupSect, - Section &GraphSection) { + Block &BlockToFix) { using Base = ELFLinkGraphBuilder<ELFT>; uint32_t SymbolIndex = Rel.getSymbol(false); @@ -123,17 +121,17 @@ private: return Kind.takeError(); int64_t Addend = Rel.r_addend; - Block *BlockToFix = *(GraphSection.blocks().begin()); - JITTargetAddress FixupAddress = FixupSect.sh_addr + Rel.r_offset; - Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress(); + orc::ExecutorAddr FixupAddress = + orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(*Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), *BlockToFix, GE, aarch64::getEdgeKindName(*Kind)); + printEdge(dbgs(), BlockToFix, GE, aarch64::getEdgeKindName(*Kind)); dbgs() << "\n"; }); - BlockToFix->addEdge(std::move(GE)); + BlockToFix.addEdge(std::move(GE)); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index b057788ce3ef..f83001417e94 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -19,6 +19,7 @@ #include "llvm/ExecutionEngine/JITLink/riscv.h" #include "llvm/Object/ELF.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/Endian.h"
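(For illustration: the RISC-V fixups below repeatedly split a 32-bit value into an upper immediate for AUIPC/LUI and a low 12-bit immediate that the hardware sign-extends; adding 0x800 before masking compensates for the borrow when the low part is negative. A minimal sketch follows, using the hypothetical helper name splitHiLoSketch; it assumes the value has already passed the isInRangeForImmS32 check used below and is not part of the LLVM sources.)

#include <cstdint>

// Split Value into the HI20/LO12 pair used by AUIPC+ADDI style sequences so
// that Hi20 + signext(Lo12) == Value. The +0x800 rounds Hi20 up whenever
// Lo12 will sign-extend to a negative number.
static void splitHiLoSketch(int32_t Value, uint32_t &Hi20, int32_t &Lo12) {
  Hi20 = ((uint32_t)Value + 0x800u) & 0xFFFFF000u;
  Lo12 = Value & 0xFFF;
  if (Lo12 >= 0x800)
    Lo12 -= 0x1000; // the value the CPU actually adds back
}

// Example: Value = 0x1800 gives Hi20 = 0x2000 and Lo12 = -0x800;
// 0x2000 + (-0x800) == 0x1800, matching the fixup arithmetic below.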
#define DEBUG_TYPE "jitlink" using namespace llvm; @@ -44,15 +45,16 @@ public: bool isGOTEdgeToFix(Edge &E) const { return E.getKind() == R_RISCV_GOT_HI20; } Symbol &createGOTEntry(Symbol &Target) { - Block &GOTBlock = G.createContentBlock( - getGOTSection(), getGOTEntryBlockContent(), 0, G.getPointerSize(), 0); + Block &GOTBlock = + G.createContentBlock(getGOTSection(), getGOTEntryBlockContent(), + orc::ExecutorAddr(), G.getPointerSize(), 0); GOTBlock.addEdge(isRV64() ? R_RISCV_64 : R_RISCV_32, 0, Target, 0); return G.addAnonymousSymbol(GOTBlock, 0, G.getPointerSize(), false, false); } Symbol &createPLTStub(Symbol &Target) { - Block &StubContentBlock = - G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 4, 0); + Block &StubContentBlock = G.createContentBlock( + getStubsSection(), getStubBlockContent(), orc::ExecutorAddr(), 4, 0); auto &GOTEntrySymbol = getGOTEntry(Target); StubContentBlock.addEdge(R_RISCV_CALL, 0, GOTEntrySymbol, 0); return G.addAnonymousSymbol(StubContentBlock, 0, StubEntrySize, true, @@ -134,13 +136,13 @@ static Expected<const Edge &> getRISCVPCRelHi20(const Edge &E) { const Symbol &Sym = E.getTarget(); const Block &B = Sym.getBlock(); - JITTargetAddress Offset = Sym.getOffset(); + orc::ExecutorAddrDiff Offset = Sym.getOffset(); struct Comp { - bool operator()(const Edge &Lhs, JITTargetAddress Offset) { + bool operator()(const Edge &Lhs, orc::ExecutorAddrDiff Offset) { return Lhs.getOffset() < Offset; } - bool operator()(JITTargetAddress Offset, const Edge &Rhs) { + bool operator()(orc::ExecutorAddrDiff Offset, const Edge &Rhs) { return Offset < Rhs.getOffset(); } }; @@ -157,8 +159,24 @@ static Expected<const Edge &> getRISCVPCRelHi20(const Edge &E) { "No HI20 PCREL relocation type be found for LO12 PCREL relocation type"); } -static uint32_t extractBits(uint64_t Num, unsigned High, unsigned Low) { - return (Num & ((1ULL << (High + 1)) - 1)) >> Low; +static uint32_t extractBits(uint32_t Num, unsigned Low, unsigned Size) { + return (Num & (((1ULL << (Size + 1)) - 1) << Low)) >> Low; +} + +inline Error checkAlignment(llvm::orc::ExecutorAddr loc, uint64_t v, int n, + const Edge &E) { + if (v & (n - 1)) + return make_error<JITLinkError>("0x" + llvm::utohexstr(loc.getValue()) + + " improper alignment for relocation " + + formatv("{0:d}", E.getKind()) + ": 0x" + + llvm::utohexstr(v) + " is not aligned to " + + Twine(n) + " bytes"); + return Error::success(); +} + +static inline bool isInRangeForImmS32(int64_t Value) { + return (Value >= std::numeric_limits<int32_t>::min() && + Value <= std::numeric_limits<int32_t>::max()); } class ELFJITLinker_riscv : public JITLinker<ELFJITLinker_riscv> { @@ -176,27 +194,47 @@ private: char *BlockWorkingMem = B.getAlreadyMutableContent().data(); char *FixupPtr = BlockWorkingMem + E.getOffset(); - JITTargetAddress FixupAddress = B.getAddress() + E.getOffset(); + orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); switch (E.getKind()) { case R_RISCV_32: { - int64_t Value = E.getTarget().getAddress() + E.getAddend(); + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); *(little32_t *)FixupPtr = static_cast<uint32_t>(Value); break; } case R_RISCV_64: { - int64_t Value = E.getTarget().getAddress() + E.getAddend(); + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); *(little64_t *)FixupPtr = static_cast<uint64_t>(Value); break; } + case R_RISCV_BRANCH: { + int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; + Error AlignmentIssue = 
checkAlignment(FixupAddress, Value, 2, E); + if (AlignmentIssue) { + return AlignmentIssue; + } + int64_t Lo = Value & 0xFFF; + uint32_t Imm31_25 = extractBits(Lo, 5, 6) << 25 | extractBits(Lo, 12, 1) + << 31; + uint32_t Imm11_7 = extractBits(Lo, 1, 4) << 8 | extractBits(Lo, 11, 1) + << 7; + uint32_t RawInstr = *(little32_t *)FixupPtr; + *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7; + break; + } case R_RISCV_HI20: { - int64_t Value = E.getTarget().getAddress() + E.getAddend(); - int32_t Hi = (Value + 0x800) & 0xFFFFF000; + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); + int64_t Hi = Value + 0x800; + if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi))) + return makeTargetOutOfRangeError(G, B, E); uint32_t RawInstr = *(little32_t *)FixupPtr; - *(little32_t *)FixupPtr = (RawInstr & 0xFFF) | static_cast<uint32_t>(Hi); + *(little32_t *)FixupPtr = + (RawInstr & 0xFFF) | (static_cast<uint32_t>(Hi & 0xFFFFF000)); break; } case R_RISCV_LO12_I: { - int64_t Value = E.getTarget().getAddress() + E.getAddend(); + // FIXME: We assume that an R_RISCV_HI20 relocation is present in the + // object code and pairs with this R_RISCV_LO12_I, so a check may be needed here. + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); int32_t Lo = Value & 0xFFF; uint32_t RawInstr = *(little32_t *)FixupPtr; *(little32_t *)FixupPtr = @@ -205,23 +243,32 @@ private: } case R_RISCV_CALL: { int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; - int32_t Hi = (Value + 0x800) & 0xFFFFF000; + int64_t Hi = Value + 0x800; + if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi))) + return makeTargetOutOfRangeError(G, B, E); int32_t Lo = Value & 0xFFF; uint32_t RawInstrAuipc = *(little32_t *)FixupPtr; uint32_t RawInstrJalr = *(little32_t *)(FixupPtr + 4); - *(little32_t *)FixupPtr = RawInstrAuipc | static_cast<uint32_t>(Hi); + *(little32_t *)FixupPtr = + RawInstrAuipc | (static_cast<uint32_t>(Hi & 0xFFFFF000)); *(little32_t *)(FixupPtr + 4) = RawInstrJalr | (static_cast<uint32_t>(Lo) << 20); break; } case R_RISCV_PCREL_HI20: { int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; - int32_t Hi = (Value + 0x800) & 0xFFFFF000; + int64_t Hi = Value + 0x800; + if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi))) + return makeTargetOutOfRangeError(G, B, E); uint32_t RawInstr = *(little32_t *)FixupPtr; - *(little32_t *)FixupPtr = (RawInstr & 0xFFF) | static_cast<uint32_t>(Hi); + *(little32_t *)FixupPtr = + (RawInstr & 0xFFF) | (static_cast<uint32_t>(Hi & 0xFFFFF000)); break; } case R_RISCV_PCREL_LO12_I: { + // FIXME: We assume that an R_RISCV_PCREL_HI20 relocation is present in + // the object code and pairs with this R_RISCV_PCREL_LO12_I, so a check + // may be needed here. auto RelHI20 = getRISCVPCRelHi20(E); if (!RelHI20) return RelHI20.takeError(); @@ -234,17 +281,117 @@ private: break; } case R_RISCV_PCREL_LO12_S: { + // FIXME: We assume that an R_RISCV_PCREL_HI20 relocation is present in + // the object code and pairs with this R_RISCV_PCREL_LO12_S, so a check + // may be needed here.
auto RelHI20 = getRISCVPCRelHi20(E); int64_t Value = RelHI20->getTarget().getAddress() + RelHI20->getAddend() - E.getTarget().getAddress(); int64_t Lo = Value & 0xFFF; - uint32_t Imm31_25 = extractBits(Lo, 11, 5) << 25; - uint32_t Imm11_7 = extractBits(Lo, 4, 0) << 7; + uint32_t Imm31_25 = extractBits(Lo, 5, 7) << 25; + uint32_t Imm11_7 = extractBits(Lo, 0, 5) << 7; uint32_t RawInstr = *(little32_t *)FixupPtr; *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7; break; } + case R_RISCV_ADD64: { + int64_t Value = (E.getTarget().getAddress() + + support::endian::read64le(reinterpret_cast<const void *>( + FixupAddress.getValue())) + + E.getAddend()) + .getValue(); + *(little64_t *)FixupPtr = static_cast<uint64_t>(Value); + break; + } + case R_RISCV_ADD32: { + int64_t Value = (E.getTarget().getAddress() + + support::endian::read32le(reinterpret_cast<const void *>( + FixupAddress.getValue())) + + E.getAddend()) + .getValue(); + *(little32_t *)FixupPtr = static_cast<uint32_t>(Value); + break; + } + case R_RISCV_ADD16: { + int64_t Value = (E.getTarget().getAddress() + + support::endian::read16le(reinterpret_cast<const void *>( + FixupAddress.getValue())) + + E.getAddend()) + .getValue(); + *(little16_t *)FixupPtr = static_cast<uint32_t>(Value); + break; + } + case R_RISCV_ADD8: { + int64_t Value = + (E.getTarget().getAddress() + + *(reinterpret_cast<const uint8_t *>(FixupAddress.getValue())) + + E.getAddend()) + .getValue(); + *FixupPtr = static_cast<uint8_t>(Value); + break; + } + case R_RISCV_SUB64: { + int64_t Value = support::endian::read64le(reinterpret_cast<const void *>( + FixupAddress.getValue())) - + E.getTarget().getAddress().getValue() - E.getAddend(); + *(little64_t *)FixupPtr = static_cast<uint64_t>(Value); + break; + } + case R_RISCV_SUB32: { + int64_t Value = support::endian::read32le(reinterpret_cast<const void *>( + FixupAddress.getValue())) - + E.getTarget().getAddress().getValue() - E.getAddend(); + *(little32_t *)FixupPtr = static_cast<uint32_t>(Value); + break; + } + case R_RISCV_SUB16: { + int64_t Value = support::endian::read16le(reinterpret_cast<const void *>( + FixupAddress.getValue())) - + E.getTarget().getAddress().getValue() - E.getAddend(); + *(little16_t *)FixupPtr = static_cast<uint32_t>(Value); + break; + } + case R_RISCV_SUB8: { + int64_t Value = + *(reinterpret_cast<const uint8_t *>(FixupAddress.getValue())) - + E.getTarget().getAddress().getValue() - E.getAddend(); + *FixupPtr = static_cast<uint8_t>(Value); + break; + } + case R_RISCV_SET6: { + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); + uint32_t RawData = *(little32_t *)FixupPtr; + int64_t Word6 = Value & 0x3f; + *(little32_t *)FixupPtr = (RawData & 0xffffffc0) | Word6; + break; + } + case R_RISCV_SET8: { + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); + uint32_t RawData = *(little32_t *)FixupPtr; + int64_t Word8 = Value & 0xff; + *(little32_t *)FixupPtr = (RawData & 0xffffff00) | Word8; + break; + } + case R_RISCV_SET16: { + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); + uint32_t RawData = *(little32_t *)FixupPtr; + int64_t Word16 = Value & 0xffff; + *(little32_t *)FixupPtr = (RawData & 0xffff0000) | Word16; + break; + } + case R_RISCV_SET32: { + int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); + int64_t Word32 = Value & 0xffffffff; + *(little32_t *)FixupPtr = Word32; + break; + } + case R_RISCV_32_PCREL: { + int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; + 
int64_t Word32 = Value & 0xffffffff; + *(little32_t *)FixupPtr = Word32; + break; + } } return Error::success(); } @@ -261,6 +408,8 @@ private: return EdgeKind_riscv::R_RISCV_32; case ELF::R_RISCV_64: return EdgeKind_riscv::R_RISCV_64; + case ELF::R_RISCV_BRANCH: + return EdgeKind_riscv::R_RISCV_BRANCH; case ELF::R_RISCV_HI20: return EdgeKind_riscv::R_RISCV_HI20; case ELF::R_RISCV_LO12_I: @@ -277,6 +426,32 @@ private: return EdgeKind_riscv::R_RISCV_GOT_HI20; case ELF::R_RISCV_CALL_PLT: return EdgeKind_riscv::R_RISCV_CALL_PLT; + case ELF::R_RISCV_ADD64: + return EdgeKind_riscv::R_RISCV_ADD64; + case ELF::R_RISCV_ADD32: + return EdgeKind_riscv::R_RISCV_ADD32; + case ELF::R_RISCV_ADD16: + return EdgeKind_riscv::R_RISCV_ADD16; + case ELF::R_RISCV_ADD8: + return EdgeKind_riscv::R_RISCV_ADD8; + case ELF::R_RISCV_SUB64: + return EdgeKind_riscv::R_RISCV_SUB64; + case ELF::R_RISCV_SUB32: + return EdgeKind_riscv::R_RISCV_SUB32; + case ELF::R_RISCV_SUB16: + return EdgeKind_riscv::R_RISCV_SUB16; + case ELF::R_RISCV_SUB8: + return EdgeKind_riscv::R_RISCV_SUB8; + case ELF::R_RISCV_SET6: + return EdgeKind_riscv::R_RISCV_SET6; + case ELF::R_RISCV_SET8: + return EdgeKind_riscv::R_RISCV_SET8; + case ELF::R_RISCV_SET16: + return EdgeKind_riscv::R_RISCV_SET16; + case ELF::R_RISCV_SET32: + return EdgeKind_riscv::R_RISCV_SET32; + case ELF::R_RISCV_32_PCREL: + return EdgeKind_riscv::R_RISCV_32_PCREL; } return make_error<JITLinkError>("Unsupported riscv relocation:" + @@ -298,7 +473,7 @@ private: Error addSingleRelocation(const typename ELFT::Rela &Rel, const typename ELFT::Shdr &FixupSect, - Section &GraphSection) { + Block &BlockToFix) { using Base = ELFLinkGraphBuilder<ELFT>; uint32_t SymbolIndex = Rel.getSymbol(false); @@ -321,17 +496,16 @@ private: return Kind.takeError(); int64_t Addend = Rel.r_addend; - Block *BlockToFix = *(GraphSection.blocks().begin()); - JITTargetAddress FixupAddress = FixupSect.sh_addr + Rel.r_offset; - Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress(); + auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(*Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), *BlockToFix, GE, riscv::getEdgeKindName(*Kind)); + printEdge(dbgs(), BlockToFix, GE, riscv::getEdgeKindName(*Kind)); dbgs() << "\n"; }); - BlockToFix->addEdge(std::move(GE)); + BlockToFix.addEdge(std::move(GE)); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 27d8833ae19e..79d2cdbb30f1 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -59,8 +59,8 @@ public: // the TLS Info entry's key value will be written by the fixTLVSectionByName // pass, so create mutable content. 
auto &TLSInfoEntry = G.createMutableContentBlock( - getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()), 0, 8, - 0); + getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()), + orc::ExecutorAddr(), 8, 0); TLSInfoEntry.addEdge(x86_64::Pointer64, 8, Target, 0); return G.addAnonymousSymbol(TLSInfoEntry, 0, 16, false, false); } @@ -172,7 +172,7 @@ private: Error addSingleRelocation(const typename ELFT::Rela &Rel, const typename ELFT::Shdr &FixupSection, - Section &GraphSection) { + Block &BlockToFix) { using Base = ELFLinkGraphBuilder<ELFT>; uint32_t SymbolIndex = Rel.getSymbol(false); @@ -248,17 +248,16 @@ private: } } - Block *BlockToFix = *(GraphSection.blocks().begin()); - JITTargetAddress FixupAddress = FixupSection.sh_addr + Rel.r_offset; - Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress(); + auto FixupAddress = orc::ExecutorAddr(FixupSection.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), *BlockToFix, GE, x86_64::getEdgeKindName(Kind)); + printEdge(dbgs(), BlockToFix, GE, x86_64::getEdgeKindName(Kind)); dbgs() << "\n"; }); - BlockToFix->addEdge(std::move(GE)); + BlockToFix.addEdge(std::move(GE)); return Error::success(); } @@ -322,8 +321,9 @@ private: // If there's no defined symbol then create one. SectionRange SR(*GOTSection); if (SR.empty()) - GOTSymbol = &G.addAbsoluteSymbol(ELFGOTSymbolName, 0, 0, - Linkage::Strong, Scope::Local, true); + GOTSymbol = + &G.addAbsoluteSymbol(ELFGOTSymbolName, orc::ExecutorAddr(), 0, + Linkage::Strong, Scope::Local, true); else GOTSymbol = &G.addDefinedSymbol(*SR.getFirstBlock(), 0, ELFGOTSymbolName, 0, diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 51dcc1c35fad..78a603cfed17 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -90,8 +90,8 @@ const char *getScopeName(Scope S) { } raw_ostream &operator<<(raw_ostream &OS, const Block &B) { - return OS << formatv("{0:x16}", B.getAddress()) << " -- " - << formatv("{0:x8}", B.getAddress() + B.getSize()) << ": " + return OS << B.getAddress() << " -- " << (B.getAddress() + B.getSize()) + << ": " << "size = " << formatv("{0:x8}", B.getSize()) << ", " << (B.isZeroFill() ? "zero-fill" : "content") << ", align = " << B.getAlignment() @@ -100,9 +100,8 @@ raw_ostream &operator<<(raw_ostream &OS, const Block &B) { } raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) { - OS << formatv("{0:x16}", Sym.getAddress()) << " (" - << (Sym.isDefined() ? "block" : "addressable") << " + " - << formatv("{0:x8}", Sym.getOffset()) + OS << Sym.getAddress() << " (" << (Sym.isDefined() ? 
"block" : "addressable") + << " + " << formatv("{0:x8}", Sym.getOffset()) << "): size: " << formatv("{0:x8}", Sym.getSize()) << ", linkage: " << formatv("{0:6}", getLinkageName(Sym.getLinkage())) << ", scope: " << formatv("{0:8}", getScopeName(Sym.getScope())) << ", " @@ -113,9 +112,9 @@ raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) { void printEdge(raw_ostream &OS, const Block &B, const Edge &E, StringRef EdgeKindName) { - OS << "edge@" << formatv("{0:x16}", B.getAddress() + E.getOffset()) << ": " - << formatv("{0:x16}", B.getAddress()) << " + " - << formatv("{0:x}", E.getOffset()) << " -- " << EdgeKindName << " -> "; + OS << "edge@" << B.getAddress() + E.getOffset() << ": " << B.getAddress() + << " + " << formatv("{0:x}", E.getOffset()) << " -- " << EdgeKindName + << " -> "; auto &TargetSym = E.getTarget(); if (TargetSym.hasName()) @@ -123,17 +122,16 @@ void printEdge(raw_ostream &OS, const Block &B, const Edge &E, else { auto &TargetBlock = TargetSym.getBlock(); auto &TargetSec = TargetBlock.getSection(); - JITTargetAddress SecAddress = ~JITTargetAddress(0); + orc::ExecutorAddr SecAddress(~uint64_t(0)); for (auto *B : TargetSec.blocks()) if (B->getAddress() < SecAddress) SecAddress = B->getAddress(); - JITTargetAddress SecDelta = TargetSym.getAddress() - SecAddress; - OS << formatv("{0:x16}", TargetSym.getAddress()) << " (section " - << TargetSec.getName(); + orc::ExecutorAddrDiff SecDelta = TargetSym.getAddress() - SecAddress; + OS << TargetSym.getAddress() << " (section " << TargetSec.getName(); if (SecDelta) OS << " + " << formatv("{0:x}", SecDelta); - OS << " / block " << formatv("{0:x16}", TargetBlock.getAddress()); + OS << " / block " << TargetBlock.getAddress(); if (TargetSym.getOffset()) OS << " + " << formatv("{0:x}", TargetSym.getOffset()); OS << ")"; @@ -265,7 +263,7 @@ void LinkGraph::dump(raw_ostream &OS) { }); for (auto *B : SortedBlocks) { - OS << " block " << formatv("{0:x16}", B->getAddress()) + OS << " block " << B->getAddress() << " size = " << formatv("{0:x8}", B->getSize()) << ", align = " << B->getAlignment() << ", alignment-offset = " << B->getAlignmentOffset(); @@ -290,9 +288,8 @@ void LinkGraph::dump(raw_ostream &OS) { return LHS.getOffset() < RHS.getOffset(); }); for (auto &E : SortedEdges) { - OS << " " << formatv("{0:x16}", B->getFixupAddress(E)) - << " (block + " << formatv("{0:x8}", E.getOffset()) - << "), addend = "; + OS << " " << B->getFixupAddress(E) << " (block + " + << formatv("{0:x8}", E.getOffset()) << "), addend = "; if (E.getAddend() >= 0) OS << formatv("+{0:x8}", E.getAddend()); else @@ -315,16 +312,14 @@ void LinkGraph::dump(raw_ostream &OS) { OS << "Absolute symbols:\n"; if (!llvm::empty(absolute_symbols())) { for (auto *Sym : absolute_symbols()) - OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym - << "\n"; + OS << " " << Sym->getAddress() << ": " << *Sym << "\n"; } else OS << " none\n"; OS << "\nExternal symbols:\n"; if (!llvm::empty(external_symbols())) { for (auto *Sym : external_symbols()) - OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym - << "\n"; + OS << " " << Sym->getAddress() << ": " << *Sym << "\n"; } else OS << " none\n"; } @@ -370,10 +365,13 @@ Error makeTargetOutOfRangeError(const LinkGraph &G, const Block &B, Section &Sec = B.getSection(); ErrStream << "In graph " << G.getName() << ", section " << Sec.getName() << ": relocation target "; - if (E.getTarget().hasName()) - ErrStream << "\"" << E.getTarget().getName() << "\" "; - ErrStream << "at address " << 
formatv("{0:x}", E.getTarget().getAddress()); - ErrStream << " is out of range of " << G.getEdgeKindName(E.getKind()) + if (E.getTarget().hasName()) { + ErrStream << "\"" << E.getTarget().getName() << "\""; + } else + ErrStream << E.getTarget().getBlock().getSection().getName() << " + " + << formatv("{0:x}", E.getOffset()); + ErrStream << " at address " << formatv("{0:x}", E.getTarget().getAddress()) + << " is out of range of " << G.getEdgeKindName(E.getKind()) << " fixup at " << formatv("{0:x}", B.getFixupAddress(E)) << " ("; Symbol *BestSymbolForBlock = nullptr; diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 706688aba4ec..35ee050c8566 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -192,7 +192,7 @@ JITLinkContext::LookupMap JITLinkerBase::getExternalSymbolNames() const { // Identify unresolved external symbols. JITLinkContext::LookupMap UnresolvedExternals; for (auto *Sym : G->external_symbols()) { - assert(Sym->getAddress() == 0 && + assert(!Sym->getAddress() && "External has already been assigned an address"); assert(Sym->getName() != StringRef() && Sym->getName() != "" && "Externals must be named"); @@ -209,11 +209,12 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { for (auto *Sym : G->external_symbols()) { assert(Sym->getOffset() == 0 && "External symbol is not at the start of its addressable block"); - assert(Sym->getAddress() == 0 && "Symbol already resolved"); + assert(!Sym->getAddress() && "Symbol already resolved"); assert(!Sym->isDefined() && "Symbol being resolved is already defined"); auto ResultI = Result.find(Sym->getName()); if (ResultI != Result.end()) - Sym->getAddressable().setAddress(ResultI->second.getAddress()); + Sym->getAddressable().setAddress( + orc::ExecutorAddr(ResultI->second.getAddress())); else assert(Sym->getLinkage() == Linkage::Weak && "Failed to resolve non-weak reference"); @@ -223,7 +224,7 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { dbgs() << "Externals after applying lookup result:\n"; for (auto *Sym : G->external_symbols()) dbgs() << " " << Sym->getName() << ": " - << formatv("{0:x16}", Sym->getAddress()) << "\n"; + << formatv("{0:x16}", Sym->getAddress().getValue()) << "\n"; }); } diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h index e4fdda0783a4..1095fa5ce701 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h @@ -19,9 +19,6 @@ #define DEBUG_TYPE "jitlink" namespace llvm { - -class MemoryBufferRef; - namespace jitlink { /// Base class for a JIT linker. @@ -161,4 +158,4 @@ void prune(LinkGraph &G); #undef DEBUG_TYPE // "jitlink" -#endif // LLVM_EXECUTIONENGINE_JITLINK_JITLINKGENERIC_H +#endif // LIB_EXECUTIONENGINE_JITLINK_JITLINKGENERIC_H diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 831b9b26d2fd..9315ac4f6120 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -15,63 +15,12 @@ using namespace llvm; -namespace { - -// FIXME: Remove this copy of CWrapperFunctionResult as soon as JITLink can -// depend on shared utils from Orc. - -// Must be kept in-sync with compiler-rt/lib/orc/c-api.h. 
-union CWrapperFunctionResultDataUnion { - char *ValuePtr; - char Value[sizeof(ValuePtr)]; -}; - -// Must be kept in-sync with compiler-rt/lib/orc/c-api.h. -typedef struct { - CWrapperFunctionResultDataUnion Data; - size_t Size; -} CWrapperFunctionResult; - -Error toError(CWrapperFunctionResult R) { - bool HasError = false; - std::string ErrMsg; - if (R.Size) { - bool Large = R.Size > sizeof(CWrapperFunctionResultDataUnion); - char *Content = Large ? R.Data.ValuePtr : R.Data.Value; - if (Content[0]) { - HasError = true; - constexpr unsigned StrStart = 1 + sizeof(uint64_t); - ErrMsg.resize(R.Size - StrStart); - memcpy(&ErrMsg[0], Content + StrStart, R.Size - StrStart); - } - if (Large) - free(R.Data.ValuePtr); - } else if (R.Data.ValuePtr) { - HasError = true; - ErrMsg = R.Data.ValuePtr; - free(R.Data.ValuePtr); - } - - if (HasError) - return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode()); - return Error::success(); -} -} // namespace - namespace llvm { namespace jitlink { JITLinkMemoryManager::~JITLinkMemoryManager() = default; JITLinkMemoryManager::InFlightAlloc::~InFlightAlloc() = default; -static Error runAllocAction(JITLinkMemoryManager::AllocActionCall &C) { - using WrapperFnTy = CWrapperFunctionResult (*)(const void *, size_t); - auto *Fn = jitTargetAddressToPointer<WrapperFnTy>(C.FnAddr); - - return toError(Fn(jitTargetAddressToPointer<const void *>(C.CtxAddr), - static_cast<size_t>(C.CtxSize))); -} - BasicLayout::BasicLayout(LinkGraph &G) : G(G) { for (auto &Sec : G.sections()) { @@ -189,7 +138,7 @@ Error BasicLayout::apply() { return Error::success(); } -JITLinkMemoryManager::AllocActions &BasicLayout::graphAllocActions() { +orc::shared::AllocActions &BasicLayout::graphAllocActions() { return G.allocActions(); } @@ -209,7 +158,7 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, std::make_unique<LinkGraph>("", Triple(), 0, support::native, nullptr); AllocGroupSmallMap<Block *> ContentBlocks; - JITTargetAddress NextAddr = 0x100000; + orc::ExecutorAddr NextAddr(0x100000); for (auto &KV : Segments) { auto &AG = KV.first; auto &Seg = KV.second; @@ -222,7 +171,8 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, Sec.setMemDeallocPolicy(AG.getMemDeallocPolicy()); if (Seg.ContentSize != 0) { - NextAddr = alignTo(NextAddr, Seg.ContentAlign); + NextAddr = + orc::ExecutorAddr(alignTo(NextAddr.getValue(), Seg.ContentAlign)); auto &B = G->createMutableContentBlock(Sec, G->allocateBuffer(Seg.ContentSize), NextAddr, Seg.ContentAlign.value(), 0); @@ -297,19 +247,11 @@ public: } // Run finalization actions. - // FIXME: Roll back previous successful actions on failure. - std::vector<AllocActionCall> DeallocActions; - DeallocActions.reserve(G.allocActions().size()); - for (auto &ActPair : G.allocActions()) { - if (ActPair.Finalize.FnAddr) - if (auto Err = runAllocAction(ActPair.Finalize)) { - OnFinalized(std::move(Err)); - return; - } - if (ActPair.Dealloc.FnAddr) - DeallocActions.push_back(ActPair.Dealloc); + auto DeallocActions = runFinalizeActions(G.allocActions()); + if (!DeallocActions) { + OnFinalized(DeallocActions.takeError()); + return; } - G.allocActions().clear(); // Release the finalize segments slab. if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments)) { @@ -319,7 +261,7 @@ public: // Continue with finalized allocation. 
OnFinalized(MemMgr.createFinalizedAlloc(std::move(StandardSegments), - std::move(DeallocActions))); + std::move(*DeallocActions))); } void abandon(OnAbandonedFunction OnAbandoned) override { @@ -428,8 +370,8 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, static_cast<size_t>(SegsSizes->FinalizeSegs)}; } - auto NextStandardSegAddr = pointerToJITTargetAddress(StandardSegsMem.base()); - auto NextFinalizeSegAddr = pointerToJITTargetAddress(FinalizeSegsMem.base()); + auto NextStandardSegAddr = orc::ExecutorAddr::fromPtr(StandardSegsMem.base()); + auto NextFinalizeSegAddr = orc::ExecutorAddr::fromPtr(FinalizeSegsMem.base()); LLVM_DEBUG({ dbgs() << "InProcessMemoryManager allocated:\n"; @@ -456,7 +398,7 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, ? NextStandardSegAddr : NextFinalizeSegAddr; - Seg.WorkingMem = jitTargetAddressToPointer<char *>(SegAddr); + Seg.WorkingMem = SegAddr.toPtr<char *>(); Seg.Addr = SegAddr; SegAddr += alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize); @@ -475,13 +417,12 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) { std::vector<sys::MemoryBlock> StandardSegmentsList; - std::vector<std::vector<AllocActionCall>> DeallocActionsList; + std::vector<std::vector<orc::shared::WrapperFunctionCall>> DeallocActionsList; { std::lock_guard<std::mutex> Lock(FinalizedAllocsMutex); for (auto &Alloc : Allocs) { - auto *FA = - jitTargetAddressToPointer<FinalizedAllocInfo *>(Alloc.release()); + auto *FA = Alloc.release().toPtr<FinalizedAllocInfo *>(); StandardSegmentsList.push_back(std::move(FA->StandardSegments)); if (!FA->DeallocActions.empty()) DeallocActionsList.push_back(std::move(FA->DeallocActions)); @@ -498,7 +439,7 @@ void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs, /// Run any deallocate calls. 
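// [Sketch] In the hunk above, finalization is routed through a shared
// runFinalizeActions() helper: each alloc action is a (finalize, dealloc)
// pair, the finalize halves run at finalization time, and the dealloc
// halves are handed back to the caller for the teardown loop that follows
// below. A simplified model of that contract, with std::function and error
// strings standing in for WrapperFunctionCall and llvm::Error:
#include <functional>
#include <optional>
#include <string>
#include <vector>

using Action = std::function<std::optional<std::string>()>; // returns error text
struct ActionPair { Action Finalize, Dealloc; };

std::optional<std::vector<Action>>
runFinalizeActions(std::vector<ActionPair> &Actions) {
  std::vector<Action> DeallocActions;
  DeallocActions.reserve(Actions.size());
  for (auto &P : Actions) {
    if (P.Finalize && P.Finalize())
      return std::nullopt; // real code would surface the error to the caller
    if (P.Dealloc)
      DeallocActions.push_back(std::move(P.Dealloc));
  }
  Actions.clear(); // ownership of the dealloc halves moves to the caller
  return DeallocActions;
}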
while (!DeallocActions.empty()) { - if (auto Err = runAllocAction(DeallocActions.back())) + if (auto Err = DeallocActions.back().runWithSPSRetErrorMerged()) DeallocErr = joinErrors(std::move(DeallocErr), std::move(Err)); DeallocActions.pop_back(); } @@ -517,12 +458,12 @@ void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs, JITLinkMemoryManager::FinalizedAlloc InProcessMemoryManager::createFinalizedAlloc( sys::MemoryBlock StandardSegments, - std::vector<AllocActionCall> DeallocActions) { + std::vector<orc::shared::WrapperFunctionCall> DeallocActions) { std::lock_guard<std::mutex> Lock(FinalizedAllocsMutex); auto *FA = FinalizedAllocInfos.Allocate<FinalizedAllocInfo>(); new (FA) FinalizedAllocInfo( {std::move(StandardSegments), std::move(DeallocActions)}); - return FinalizedAlloc(pointerToJITTargetAddress(FA)); + return FinalizedAlloc(orc::ExecutorAddr::fromPtr(FA)); } } // end namespace jitlink diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index d588b63d9e88..62574604458c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -134,7 +134,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() { memcpy(&NSec.SegName, Sec64.segname, 16); NSec.SegName[16] = '\0'; - NSec.Address = Sec64.addr; + NSec.Address = orc::ExecutorAddr(Sec64.addr); NSec.Size = Sec64.size; NSec.Alignment = 1ULL << Sec64.align; NSec.Flags = Sec64.flags; @@ -147,7 +147,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() { memcpy(&NSec.SegName, Sec32.segname, 16); NSec.SegName[16] = '\0'; - NSec.Address = Sec32.addr; + NSec.Address = orc::ExecutorAddr(Sec32.addr); NSec.Size = Sec32.size; NSec.Alignment = 1ULL << Sec32.align; NSec.Flags = Sec32.flags; @@ -287,7 +287,8 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() { if (!NSec) return NSec.takeError(); - if (Value < NSec->Address || Value > NSec->Address + NSec->Size) + if (orc::ExecutorAddr(Value) < NSec->Address || + orc::ExecutorAddr(Value) > NSec->Address + NSec->Size) return make_error<JITLinkError>("Address " + formatv("{0:x}", Value) + " for symbol " + *Name + " does not fall within section"); @@ -311,8 +312,9 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() { } void MachOLinkGraphBuilder::addSectionStartSymAndBlock( - unsigned SecIndex, Section &GraphSec, uint64_t Address, const char *Data, - uint64_t Size, uint32_t Alignment, bool IsLive) { + unsigned SecIndex, Section &GraphSec, orc::ExecutorAddr Address, + const char *Data, orc::ExecutorAddrDiff Size, uint32_t Alignment, + bool IsLive) { Block &B = Data ? 
G->createContentBlock(GraphSec, ArrayRef<char>(Data, Size), Address, Alignment, 0) @@ -346,7 +348,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { return make_error<JITLinkError>("Anonymous common symbol at index " + Twine(KV.first)); NSym.GraphSymbol = &G->addCommonSymbol( - *NSym.Name, NSym.S, getCommonSection(), 0, NSym.Value, + *NSym.Name, NSym.S, getCommonSection(), orc::ExecutorAddr(), + orc::ExecutorAddrDiff(NSym.Value), 1ull << MachO::GET_COMM_ALIGN(NSym.Desc), NSym.Desc & MachO::N_NO_DEAD_STRIP); } else { @@ -364,8 +367,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { return make_error<JITLinkError>("Anonymous absolute symbol at index " + Twine(KV.first)); NSym.GraphSymbol = &G->addAbsoluteSymbol( - *NSym.Name, NSym.Value, 0, Linkage::Strong, Scope::Default, - NSym.Desc & MachO::N_NO_DEAD_STRIP); + *NSym.Name, orc::ExecutorAddr(NSym.Value), 0, Linkage::Strong, + Scope::Default, NSym.Desc & MachO::N_NO_DEAD_STRIP); break; case MachO::N_SECT: SecIndexToSymbols[NSym.Sect - 1].push_back(&NSym); @@ -468,13 +471,13 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { // If the section is non-empty but there is no symbol covering the start // address then add an anonymous one. - if (SecNSymStack.back()->Value != NSec.Address) { - auto AnonBlockSize = SecNSymStack.back()->Value - NSec.Address; + if (orc::ExecutorAddr(SecNSymStack.back()->Value) != NSec.Address) { + auto AnonBlockSize = + orc::ExecutorAddr(SecNSymStack.back()->Value) - NSec.Address; LLVM_DEBUG({ dbgs() << " Section start not covered by symbol. " - << "Creating anonymous block to cover [ " - << formatv("{0:x16}", NSec.Address) << " -- " - << formatv("{0:x16}", NSec.Address + AnonBlockSize) << " ]\n"; + << "Creating anonymous block to cover [ " << NSec.Address + << " -- " << (NSec.Address + AnonBlockSize) << " ]\n"; }); addSectionStartSymAndBlock(SecIndex, *NSec.GraphSection, NSec.Address, NSec.Data, AnonBlockSize, NSec.Alignment, @@ -496,12 +499,12 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { } // BlockNSyms now contains the block symbols in reverse canonical order. - JITTargetAddress BlockStart = BlockSyms.front()->Value; - JITTargetAddress BlockEnd = SecNSymStack.empty() - ? NSec.Address + NSec.Size - : SecNSymStack.back()->Value; - JITTargetAddress BlockOffset = BlockStart - NSec.Address; - JITTargetAddress BlockSize = BlockEnd - BlockStart; + auto BlockStart = orc::ExecutorAddr(BlockSyms.front()->Value); + orc::ExecutorAddr BlockEnd = + SecNSymStack.empty() ? 
NSec.Address + NSec.Size + : orc::ExecutorAddr(SecNSymStack.back()->Value); + orc::ExecutorAddrDiff BlockOffset = BlockStart - NSec.Address; + orc::ExecutorAddrDiff BlockSize = BlockEnd - BlockStart; LLVM_DEBUG({ dbgs() << " Creating block for " << formatv("{0:x16}", BlockStart) @@ -521,8 +524,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { BlockStart, NSec.Alignment, BlockStart % NSec.Alignment); - Optional<JITTargetAddress> LastCanonicalAddr; - JITTargetAddress SymEnd = BlockEnd; + Optional<orc::ExecutorAddr> LastCanonicalAddr; + auto SymEnd = BlockEnd; while (!BlockSyms.empty()) { auto &NSym = *BlockSyms.back(); BlockSyms.pop_back(); @@ -530,9 +533,9 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { bool SymLive = (NSym.Desc & MachO::N_NO_DEAD_STRIP) || SectionIsNoDeadStrip; - auto &Sym = createStandardGraphSymbol(NSym, B, SymEnd - NSym.Value, - SectionIsText, SymLive, - LastCanonicalAddr != NSym.Value); + auto &Sym = createStandardGraphSymbol( + NSym, B, SymEnd - orc::ExecutorAddr(NSym.Value), SectionIsText, + SymLive, LastCanonicalAddr != orc::ExecutorAddr(NSym.Value)); if (LastCanonicalAddr != Sym.getAddress()) { if (LastCanonicalAddr) @@ -568,11 +571,12 @@ Symbol &MachOLinkGraphBuilder::createStandardGraphSymbol(NormalizedSymbol &NSym, dbgs() << "\n"; }); - auto &Sym = NSym.Name ? G->addDefinedSymbol(B, NSym.Value - B.getAddress(), - *NSym.Name, Size, NSym.L, NSym.S, - IsText, IsNoDeadStrip) - : G->addAnonymousSymbol(B, NSym.Value - B.getAddress(), - Size, IsText, IsNoDeadStrip); + auto SymOffset = orc::ExecutorAddr(NSym.Value) - B.getAddress(); + auto &Sym = + NSym.Name + ? G->addDefinedSymbol(B, SymOffset, *NSym.Name, Size, NSym.L, NSym.S, + IsText, IsNoDeadStrip) + : G->addAnonymousSymbol(B, SymOffset, Size, IsText, IsNoDeadStrip); NSym.GraphSymbol = &Sym; if (IsCanonical) @@ -635,12 +639,12 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( bool SectionIsNoDeadStrip = NSec.Flags & MachO::S_ATTR_NO_DEAD_STRIP; bool SectionIsText = NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS; - JITTargetAddress BlockStart = 0; + orc::ExecutorAddrDiff BlockStart = 0; // Scan section for null characters. for (size_t I = 0; I != NSec.Size; ++I) if (NSec.Data[I] == '\0') { - JITTargetAddress BlockEnd = I + 1; + orc::ExecutorAddrDiff BlockEnd = I + 1; size_t BlockSize = BlockEnd - BlockStart; // Create a block for this null terminated string. auto &B = G->createContentBlock(*NSec.GraphSection, @@ -654,7 +658,8 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( }); // If there's no symbol at the start of this block then create one. - if (NSyms.empty() || NSyms.back()->Value != B.getAddress()) { + if (NSyms.empty() || + orc::ExecutorAddr(NSyms.back()->Value) != B.getAddress()) { auto &S = G->addAnonymousSymbol(B, 0, BlockSize, false, false); setCanonicalSymbol(NSec, S); LLVM_DEBUG({ @@ -666,18 +671,19 @@ Error MachOLinkGraphBuilder::graphifyCStringSection( } // Process any remaining symbols that point into this block. 
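// [Sketch] graphifyCStringSection above cuts the section payload at every
// NUL byte so each terminated string becomes its own block with its own
// symbol. The scan in isolation, with plain offsets instead of LinkGraph
// blocks:
#include <cstddef>
#include <string_view>
#include <vector>

struct StringBlock { size_t Offset, Size; }; // Size includes the terminator

std::vector<StringBlock> splitCStrings(std::string_view SectionData) {
  std::vector<StringBlock> Blocks;
  size_t BlockStart = 0;
  for (size_t I = 0; I != SectionData.size(); ++I)
    if (SectionData[I] == '\0') {
      Blocks.push_back({BlockStart, I + 1 - BlockStart});
      BlockStart = I + 1;
    }
  return Blocks;
}
// e.g. std::string_view("foo\0bar\0", 8) -> {0, 4} and {4, 4}.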
- JITTargetAddress LastCanonicalAddr = B.getAddress() + BlockEnd; - while (!NSyms.empty() && - NSyms.back()->Value < (B.getAddress() + BlockSize)) { + auto LastCanonicalAddr = B.getAddress() + BlockEnd; + while (!NSyms.empty() && orc::ExecutorAddr(NSyms.back()->Value) < + B.getAddress() + BlockSize) { auto &NSym = *NSyms.back(); - size_t SymSize = (B.getAddress() + BlockSize) - NSyms.back()->Value; + size_t SymSize = (B.getAddress() + BlockSize) - + orc::ExecutorAddr(NSyms.back()->Value); bool SymLive = (NSym.Desc & MachO::N_NO_DEAD_STRIP) || SectionIsNoDeadStrip; bool IsCanonical = false; - if (LastCanonicalAddr != NSym.Value) { + if (LastCanonicalAddr != orc::ExecutorAddr(NSym.Value)) { IsCanonical = true; - LastCanonicalAddr = NSym.Value; + LastCanonicalAddr = orc::ExecutorAddr(NSym.Value); } createStandardGraphSymbol(NSym, B, SymSize, SectionIsText, SymLive, @@ -785,7 +791,7 @@ Error CompactUnwindSplitter::operator()(LinkGraph &G) { E.getTarget().getName() + " is an external symbol"); auto &TgtBlock = E.getTarget().getBlock(); auto &CURecSym = - G.addAnonymousSymbol(CURec, 0, CURecordSize, 0, false); + G.addAnonymousSymbol(CURec, 0, CURecordSize, false, false); TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0); AddedKeepAlive = true; } else if (E.getOffset() != PersonalityEdgeOffset && diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h index d29732ebdba8..2951a8533098 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h @@ -71,13 +71,13 @@ protected: public: char SectName[17]; char SegName[17]; - uint64_t Address = 0; + orc::ExecutorAddr Address; uint64_t Size = 0; uint64_t Alignment = 0; uint32_t Flags = 0; const char *Data = nullptr; Section *GraphSection = nullptr; - std::map<JITTargetAddress, Symbol *> CanonicalSymbols; + std::map<orc::ExecutorAddr, Symbol *> CanonicalSymbols; }; using SectionParserFunction = std::function<Error(NormalizedSection &S)>; @@ -137,7 +137,7 @@ protected: /// Returns the symbol with the highest address not greater than the search /// address, or null if no such symbol exists. Symbol *getSymbolByAddress(NormalizedSection &NSec, - JITTargetAddress Address) { + orc::ExecutorAddr Address) { auto I = NSec.CanonicalSymbols.upper_bound(Address); if (I == NSec.CanonicalSymbols.begin()) return nullptr; @@ -147,7 +147,7 @@ protected: /// Returns the symbol with the highest address not greater than the search /// address, or an error if no such symbol exists. 
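// [Sketch] getSymbolByAddress above leans on std::map ordering: upper_bound
// finds the first canonical symbol strictly after the query address, so the
// entry just before it is the highest symbol at or below that address. The
// same idiom detached from the MachO builder:
#include <cstdint>
#include <iterator>
#include <map>

template <typename SymT>
SymT *findByAddress(std::map<uint64_t, SymT *> &CanonicalSyms, uint64_t Addr) {
  auto I = CanonicalSyms.upper_bound(Addr); // first entry with key > Addr
  if (I == CanonicalSyms.begin())
    return nullptr;                         // every symbol starts above Addr
  return std::prev(I)->second;              // highest key <= Addr
}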
Expected<Symbol &> findSymbolByAddress(NormalizedSection &NSec, - JITTargetAddress Address) { + orc::ExecutorAddr Address) { auto *Sym = getSymbolByAddress(NSec, Address); if (Sym) if (Address <= Sym->getAddress() + Sym->getSize()) @@ -193,9 +193,9 @@ private: Section &getCommonSection(); void addSectionStartSymAndBlock(unsigned SecIndex, Section &GraphSec, - uint64_t Address, const char *Data, - uint64_t Size, uint32_t Alignment, - bool IsLive); + orc::ExecutorAddr Address, const char *Data, + orc::ExecutorAddrDiff Size, + uint32_t Alignment, bool IsLive); Error createNormalizedSections(); Error createNormalizedSymbols(); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index f2a029d35cd5..3ca2e40c7263 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -109,7 +109,7 @@ private: Expected<PairRelocInfo> parsePairRelocation(Block &BlockToFix, Edge::Kind SubtractorKind, const MachO::relocation_info &SubRI, - JITTargetAddress FixupAddress, const char *FixupContent, + orc::ExecutorAddr FixupAddress, const char *FixupContent, object::relocation_iterator &UnsignedRelItr, object::relocation_iterator &RelEnd) { using namespace support; @@ -162,7 +162,7 @@ private: return ToSymbolSec.takeError(); ToSymbol = getSymbolByAddress(*ToSymbolSec, ToSymbolSec->Address); assert(ToSymbol && "No symbol for section"); - FixupValue -= ToSymbol->getAddress(); + FixupValue -= ToSymbol->getAddress().getValue(); } MachOARM64RelocationKind DeltaKind; @@ -195,7 +195,7 @@ private: for (auto &S : Obj.sections()) { - JITTargetAddress SectionAddress = S.getAddress(); + orc::ExecutorAddr SectionAddress(S.getAddress()); // Skip relocations virtual sections. if (S.isVirtual()) { @@ -234,7 +234,8 @@ private: return Kind.takeError(); // Find the address of the value to fix up. - JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address; + orc::ExecutorAddr FixupAddress = + SectionAddress + (uint32_t)RI.r_address; LLVM_DEBUG({ dbgs() << " " << NSec->SectName << " + " << formatv("{0:x8}", RI.r_address) << ":\n"; @@ -249,7 +250,7 @@ private: BlockToFix = &SymbolToFixOrErr->getBlock(); } - if (FixupAddress + static_cast<JITTargetAddress>(1ULL << RI.r_length) > + if (FixupAddress + orc::ExecutorAddrDiff(1ULL << RI.r_length) > BlockToFix->getAddress() + BlockToFix->getContent().size()) return make_error<JITLinkError>( "Relocation content extends past end of fixup block"); @@ -290,7 +291,7 @@ private: }); // Find the address of the value to fix up. 
- JITTargetAddress PairedFixupAddress = + orc::ExecutorAddr PairedFixupAddress = SectionAddress + (uint32_t)RI.r_address; if (PairedFixupAddress != FixupAddress) return make_error<JITLinkError>("Paired relocation points at " @@ -324,7 +325,7 @@ private: Addend = *(const ulittle64_t *)FixupContent; break; case Pointer64Anon: { - JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent; + orc::ExecutorAddr TargetAddress(*(const ulittle64_t *)FixupContent); auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1); if (!TargetNSec) return TargetNSec.takeError(); @@ -435,7 +436,7 @@ public: Symbol &createGOTEntry(Symbol &Target) { auto &GOTEntryBlock = G.createContentBlock( - getGOTSection(), getGOTEntryBlockContent(), 0, 8, 0); + getGOTSection(), getGOTEntryBlockContent(), orc::ExecutorAddr(), 8, 0); GOTEntryBlock.addEdge(Pointer64, 0, Target, 0); return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false); } @@ -457,8 +458,8 @@ public: } Symbol &createPLTStub(Symbol &Target) { - auto &StubContentBlock = - G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 1, 0); + auto &StubContentBlock = G.createContentBlock( + getStubsSection(), getStubBlockContent(), orc::ExecutorAddr(), 1, 0); // Re-use GOT entries for stub targets. auto &GOTEntrySymbol = getGOTEntry(Target); StubContentBlock.addEdge(LDRLiteral19, 0, GOTEntrySymbol, 0); @@ -474,7 +475,7 @@ public: private: Section &getGOTSection() { if (!GOTSection) - GOTSection = &G.createSection("$__GOT", MemProt::Read); + GOTSection = &G.createSection("$__GOT", MemProt::Read | MemProt::Exec); return *GOTSection; } @@ -545,11 +546,12 @@ private: char *BlockWorkingMem = B.getAlreadyMutableContent().data(); char *FixupPtr = BlockWorkingMem + E.getOffset(); - JITTargetAddress FixupAddress = B.getAddress() + E.getOffset(); + orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); switch (E.getKind()) { case Branch26: { - assert((FixupAddress & 0x3) == 0 && "Branch-inst is not 32-bit aligned"); + assert((FixupAddress.getValue() & 0x3) == 0 && + "Branch-inst is not 32-bit aligned"); int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); @@ -569,7 +571,7 @@ private: break; } case Pointer32: { - uint64_t Value = E.getTarget().getAddress() + E.getAddend(); + uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend(); if (Value > std::numeric_limits<uint32_t>::max()) return makeTargetOutOfRangeError(G, B, E); *(ulittle32_t *)FixupPtr = Value; @@ -577,7 +579,7 @@ private: } case Pointer64: case Pointer64Anon: { - uint64_t Value = E.getTarget().getAddress() + E.getAddend(); + uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend(); *(ulittle64_t *)FixupPtr = Value; break; } @@ -587,9 +589,10 @@ private: assert((E.getKind() != GOTPage21 || E.getAddend() == 0) && "GOTPAGE21 with non-zero addend"); uint64_t TargetPage = - (E.getTarget().getAddress() + E.getAddend()) & - ~static_cast<uint64_t>(4096 - 1); - uint64_t PCPage = FixupAddress & ~static_cast<uint64_t>(4096 - 1); + (E.getTarget().getAddress().getValue() + E.getAddend()) & + ~static_cast<uint64_t>(4096 - 1); + uint64_t PCPage = + FixupAddress.getValue() & ~static_cast<uint64_t>(4096 - 1); int64_t PageDelta = TargetPage - PCPage; if (PageDelta < -(1 << 30) || PageDelta > ((1 << 30) - 1)) @@ -606,7 +609,7 @@ private: } case PageOffset12: { uint64_t TargetOffset = - (E.getTarget().getAddress() + E.getAddend()) & 0xfff; + (E.getTarget().getAddress() + E.getAddend()).getValue() & 0xfff; uint32_t RawInstr = *(ulittle32_t *)FixupPtr; 
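// [Sketch] The Page21/PageOffset12 cases above split a target address into
// the two pieces AArch64 ADRP-based addressing needs: a 4 KiB page delta
// for the ADRP itself and a low 12-bit offset for the following LDR/ADD.
// The arithmetic on its own, with the same range check the fixup performs:
#include <cstdint>
#include <optional>

struct PageParts { int64_t PageDelta; uint32_t PageOff12; };

std::optional<PageParts> splitForAdrp(uint64_t Target, uint64_t FixupAddr) {
  uint64_t TargetPage = Target & ~uint64_t(4096 - 1);
  uint64_t PCPage = FixupAddr & ~uint64_t(4096 - 1);
  int64_t PageDelta = static_cast<int64_t>(TargetPage - PCPage);
  if (PageDelta < -(1 << 30) || PageDelta > ((1 << 30) - 1))
    return std::nullopt; // out of ADRP's immediate range
  return PageParts{PageDelta, static_cast<uint32_t>(Target & 0xfff)};
}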
unsigned ImmShift = getPageOffset12Shift(RawInstr); @@ -627,7 +630,7 @@ private: assert((RawInstr & 0xfffffc00) == 0xf9400000 && "RawInstr isn't a 64-bit LDR immediate"); - uint32_t TargetOffset = E.getTarget().getAddress() & 0xfff; + uint32_t TargetOffset = E.getTarget().getAddress().getValue() & 0xfff; assert((TargetOffset & 0x7) == 0 && "GOT entry is not 8-byte aligned"); uint32_t EncodedImm = (TargetOffset >> 3) << 10; uint32_t FixedInstr = RawInstr | EncodedImm; @@ -635,7 +638,8 @@ private: break; } case LDRLiteral19: { - assert((FixupAddress & 0x3) == 0 && "LDR is not 32-bit aligned"); + assert((FixupAddress.getValue() & 0x3) == 0 && + "LDR is not 32-bit aligned"); assert(E.getAddend() == 0 && "LDRLiteral19 with non-zero addend"); uint32_t RawInstr = *(ulittle32_t *)FixupPtr; assert(RawInstr == 0x58000010 && "RawInstr isn't a 64-bit LDR literal"); @@ -705,6 +709,13 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G, Config.PrePrunePasses.push_back( CompactUnwindSplitter("__LD,__compact_unwind")); + // Add eh-frame passses. + // FIXME: Prune eh-frames for which compact-unwind is available once + // we support compact-unwind registration with libunwind. + Config.PrePrunePasses.push_back(EHFrameSplitter("__TEXT,__eh_frame")); + Config.PrePrunePasses.push_back( + EHFrameEdgeFixer("__TEXT,__eh_frame", 8, Delta64, Delta32, NegDelta32)); + // Add an in-place GOT/Stubs pass. Config.PostPrunePasses.push_back( PerGraphGOTAndPLTStubsBuilder_MachO_arm64::asPass); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index a4fcd3b9a5f5..82afaa3aa3c5 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -119,7 +119,7 @@ private: // returns the edge kind and addend to be used. Expected<PairRelocInfo> parsePairRelocation( Block &BlockToFix, MachONormalizedRelocationType SubtractorKind, - const MachO::relocation_info &SubRI, JITTargetAddress FixupAddress, + const MachO::relocation_info &SubRI, orc::ExecutorAddr FixupAddress, const char *FixupContent, object::relocation_iterator &UnsignedRelItr, object::relocation_iterator &RelEnd) { using namespace support; @@ -172,7 +172,7 @@ private: return ToSymbolSec.takeError(); ToSymbol = getSymbolByAddress(*ToSymbolSec, ToSymbolSec->Address); assert(ToSymbol && "No symbol for section"); - FixupValue -= ToSymbol->getAddress(); + FixupValue -= ToSymbol->getAddress().getValue(); } Edge::Kind DeltaKind; @@ -206,7 +206,7 @@ private: for (auto &S : Obj.sections()) { - JITTargetAddress SectionAddress = S.getAddress(); + orc::ExecutorAddr SectionAddress(S.getAddress()); // Skip relocations virtual sections. if (S.isVirtual()) { @@ -241,7 +241,7 @@ private: MachO::relocation_info RI = getRelocationInfo(RelItr); // Find the address of the value to fix up. 
- JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address; + auto FixupAddress = SectionAddress + (uint32_t)RI.r_address; LLVM_DEBUG({ dbgs() << " " << NSec->SectName << " + " @@ -257,7 +257,7 @@ private: BlockToFix = &SymbolToFixOrErr->getBlock(); } - if (FixupAddress + static_cast<JITTargetAddress>(1ULL << RI.r_length) > + if (FixupAddress + orc::ExecutorAddrDiff(1ULL << RI.r_length) > BlockToFix->getAddress() + BlockToFix->getContent().size()) return make_error<JITLinkError>( "Relocation extends past end of fixup block"); @@ -343,7 +343,7 @@ private: Kind = x86_64::Pointer64; break; case MachOPointer64Anon: { - JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent; + orc::ExecutorAddr TargetAddress(*(const ulittle64_t *)FixupContent); auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1); if (!TargetNSec) return TargetNSec.takeError(); @@ -367,8 +367,8 @@ private: Kind = x86_64::Delta32; break; case MachOPCRel32Anon: { - JITTargetAddress TargetAddress = - FixupAddress + 4 + *(const little32_t *)FixupContent; + orc::ExecutorAddr TargetAddress(FixupAddress + 4 + + *(const little32_t *)FixupContent); auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1); if (!TargetNSec) return TargetNSec.takeError(); @@ -384,10 +384,10 @@ private: case MachOPCRel32Minus1Anon: case MachOPCRel32Minus2Anon: case MachOPCRel32Minus4Anon: { - JITTargetAddress Delta = - 4 + static_cast<JITTargetAddress>( + orc::ExecutorAddrDiff Delta = + 4 + orc::ExecutorAddrDiff( 1ULL << (*MachORelocKind - MachOPCRel32Minus1Anon)); - JITTargetAddress TargetAddress = + orc::ExecutorAddr TargetAddress = FixupAddress + Delta + *(const little32_t *)FixupContent; auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1); if (!TargetNSec) diff --git a/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h b/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h index 6e9df9c75a65..6e325f92bafb 100644 --- a/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h @@ -47,16 +47,16 @@ public: if (impl().isGOTEdgeToFix(E)) { LLVM_DEBUG({ dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) - << " edge at " << formatv("{0:x}", B->getFixupAddress(E)) - << " (" << formatv("{0:x}", B->getAddress()) << " + " + << " edge at " << B->getFixupAddress(E) << " (" + << B->getAddress() << " + " << formatv("{0:x}", E.getOffset()) << ")\n"; }); impl().fixGOTEdge(E, getGOTEntry(E.getTarget())); } else if (impl().isExternalBranchEdge(E)) { LLVM_DEBUG({ dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) - << " edge at " << formatv("{0:x}", B->getFixupAddress(E)) - << " (" << formatv("{0:x}", B->getAddress()) << " + " + << " edge at " << B->getFixupAddress(E) << " (" + << B->getAddress() << " + " << formatv("{0:x}", E.getOffset()) << ")\n"; }); impl().fixPLTEdge(E, getPLTStub(E.getTarget())); diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp index 6b73ff95a3b0..3ce2cf10a24c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp @@ -24,6 +24,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_32"; case R_RISCV_64: return "R_RISCV_64"; + case R_RISCV_BRANCH: + return "R_RISCV_BRANCH"; case R_RISCV_HI20: return "R_RISCV_HI20"; case R_RISCV_LO12_I: @@ -36,6 +38,32 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_PCREL_LO12_S"; case R_RISCV_CALL: return "R_RISCV_CALL"; + case R_RISCV_32_PCREL: + 
return "R_RISCV_32_PCREL"; + case R_RISCV_ADD64: + return "R_RISCV_ADD64"; + case R_RISCV_ADD32: + return "R_RISCV_ADD32"; + case R_RISCV_ADD16: + return "R_RISCV_ADD16"; + case R_RISCV_ADD8: + return "R_RISCV_ADD8"; + case R_RISCV_SUB64: + return "R_RISCV_SUB64"; + case R_RISCV_SUB32: + return "R_RISCV_SUB32"; + case R_RISCV_SUB16: + return "R_RISCV_SUB16"; + case R_RISCV_SUB8: + return "R_RISCV_SUB8"; + case R_RISCV_SET6: + return "R_RISCV_SET6"; + case R_RISCV_SET8: + return "R_RISCV_SET8"; + case R_RISCV_SET16: + return "R_RISCV_SET16"; + case R_RISCV_SET32: + return "R_RISCV_SET32"; } return getGenericEdgeKindName(K); } diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index 48521280059d..df9979b47e88 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -95,10 +95,10 @@ Error optimizeGOTAndStubAccesses(LinkGraph &G) { assert(GOTEntryBlock.edges_size() == 1 && "GOT entry should only have one outgoing edge"); auto &GOTTarget = GOTEntryBlock.edges().begin()->getTarget(); - JITTargetAddress TargetAddr = GOTTarget.getAddress(); - JITTargetAddress EdgeAddr = B->getFixupAddress(E); + orc::ExecutorAddr TargetAddr = GOTTarget.getAddress(); + orc::ExecutorAddr EdgeAddr = B->getFixupAddress(E); int64_t Displacement = TargetAddr - EdgeAddr + 4; - bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr); + bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr.getValue()); bool DisplacementInRangeForImmS32 = isInRangeForImmS32(Displacement); // If both of the Target and displacement is out of range, then @@ -165,8 +165,8 @@ Error optimizeGOTAndStubAccesses(LinkGraph &G) { "GOT block should only have one outgoing edge"); auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); - JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); - JITTargetAddress TargetAddr = GOTTarget.getAddress(); + orc::ExecutorAddr EdgeAddr = B->getAddress() + E.getOffset(); + orc::ExecutorAddr TargetAddr = GOTTarget.getAddress(); int64_t Displacement = TargetAddr - EdgeAddr + 4; if (isInRangeForImmS32(Displacement)) { diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index aa82cf38c45d..e5cb8103919a 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -1933,9 +1933,14 @@ Error ExecutionSession::removeJITDylib(JITDylib &JD) { JDs.erase(I); }); - // Clear the JITDylib. + // Clear the JITDylib. Hold on to any error while we clean up the + // JITDylib members below. auto Err = JD.clear(); + // Notify the platform of the teardown. + if (P) + Err = joinErrors(std::move(Err), P->teardownJITDylib(JD)); + // Set JD to closed state. Clear remaining data structures. 
runSessionLocked([&] { assert(JD.State == JITDylib::Closing && "JD should be closing"); @@ -1953,19 +1958,22 @@ Error ExecutionSession::removeJITDylib(JITDylib &JD) { return Err; } -std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { +Expected<std::vector<JITDylibSP>> +JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { if (JDs.empty()) - return {}; + return std::vector<JITDylibSP>(); auto &ES = JDs.front()->getExecutionSession(); - return ES.runSessionLocked([&]() { + return ES.runSessionLocked([&]() -> Expected<std::vector<JITDylibSP>> { DenseSet<JITDylib *> Visited; std::vector<JITDylibSP> Result; for (auto &JD : JDs) { - assert(JD->State == Open && "JD is defunct"); - + if (JD->State != Open) + return make_error<StringError>( + "Error building link order: " + JD->getName() + " is defunct", + inconvertibleErrorCode()); if (Visited.count(JD.get())) continue; @@ -1990,18 +1998,19 @@ std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { }); } -std::vector<JITDylibSP> +Expected<std::vector<JITDylibSP>> JITDylib::getReverseDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { - auto Tmp = getDFSLinkOrder(JDs); - std::reverse(Tmp.begin(), Tmp.end()); - return Tmp; + auto Result = getDFSLinkOrder(JDs); + if (Result) + std::reverse(Result->begin(), Result->end()); + return Result; } -std::vector<JITDylibSP> JITDylib::getDFSLinkOrder() { +Expected<std::vector<JITDylibSP>> JITDylib::getDFSLinkOrder() { return getDFSLinkOrder({this}); } -std::vector<JITDylibSP> JITDylib::getReverseDFSLinkOrder() { +Expected<std::vector<JITDylibSP>> JITDylib::getReverseDFSLinkOrder() { return getReverseDFSLinkOrder({this}); } @@ -2201,7 +2210,7 @@ void ExecutionSession::dump(raw_ostream &OS) { void ExecutionSession::dispatchOutstandingMUs() { LLVM_DEBUG(dbgs() << "Dispatching MaterializationUnits...\n"); - while (1) { + while (true) { Optional<std::pair<std::unique_ptr<MaterializationUnit>, std::unique_ptr<MaterializationResponsibility>>> JMU; diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp index fcfe389f82a8..4ff6b7fd54df 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp @@ -67,9 +67,9 @@ private: template <typename ELFT> void ELFDebugObjectSection<ELFT>::setTargetMemoryRange(SectionRange Range) { // Only patch load-addresses for executable and data sections. 
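// [Sketch] The Core.cpp hunks above turn getDFSLinkOrder from a plain
// vector (guarded by an assert) into an Expected, so a closed JITDylib now
// produces a recoverable "is defunct" error. The traversal shape, reduced
// to a toy graph with an Open flag (hypothetical types, not the ORC
// classes):
#include <optional>
#include <string>
#include <unordered_set>
#include <vector>

struct Dylib {
  std::string Name;
  bool Open = true;
  std::vector<Dylib *> LinkOrder; // what this dylib links against
};

std::optional<std::vector<Dylib *>> dfsLinkOrder(std::vector<Dylib *> Roots) {
  std::unordered_set<Dylib *> Visited;
  std::vector<Dylib *> Result, Stack(Roots.rbegin(), Roots.rend());
  while (!Stack.empty()) {
    Dylib *D = Stack.back();
    Stack.pop_back();
    if (!D->Open)
      return std::nullopt; // "Error building link order: <name> is defunct"
    if (!Visited.insert(D).second)
      continue;
    Result.push_back(D);
    for (auto I = D->LinkOrder.rbegin(); I != D->LinkOrder.rend(); ++I)
      Stack.push_back(*I); // reversed so earlier link-order entries come first
  }
  return Result;
}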
- if (isTextOrDataSection()) { - Header->sh_addr = static_cast<typename ELFT::uint>(Range.getStart()); - } + if (isTextOrDataSection()) + Header->sh_addr = + static_cast<typename ELFT::uint>(Range.getStart().getValue()); } template <typename ELFT> diff --git a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp index fe62138c790c..6916ee4a827f 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp @@ -129,8 +129,8 @@ public: Section *Sec = nullptr; StringRef SegName; StringRef SecName; - JITTargetAddress Alignment = 0; - JITTargetAddress StartAddr = 0; + uint64_t Alignment = 0; + orc::ExecutorAddr StartAddr; uint64_t Size = 0; }; @@ -153,7 +153,8 @@ public: return Error::success(); } DebugSecInfos.push_back({&Sec, Sec.getName().substr(0, SepPos), - Sec.getName().substr(SepPos + 1), 0, 0}); + Sec.getName().substr(SepPos + 1), 0, + orc::ExecutorAddr(), 0}); } else { NonDebugSections.push_back(&Sec); @@ -182,11 +183,11 @@ public: size_t ContainerBlockSize = sizeof(typename MachOTraits::Header) + SegmentLCSize; auto ContainerBlockContent = G.allocateBuffer(ContainerBlockSize); - MachOContainerBlock = - &G.createMutableContentBlock(SDOSec, ContainerBlockContent, 0, 8, 0); + MachOContainerBlock = &G.createMutableContentBlock( + SDOSec, ContainerBlockContent, orc::ExecutorAddr(), 8, 0); // Copy debug section blocks and symbols. - JITTargetAddress NextBlockAddr = MachOContainerBlock->getSize(); + orc::ExecutorAddr NextBlockAddr(MachOContainerBlock->getSize()); for (auto &SI : DebugSecInfos) { assert(!llvm::empty(SI.Sec->blocks()) && "Empty debug info section?"); @@ -219,7 +220,8 @@ public: G.mergeSections(SDOSec, *SI.Sec); SI.Sec = nullptr; } - size_t DebugSectionsSize = NextBlockAddr - MachOContainerBlock->getSize(); + size_t DebugSectionsSize = + NextBlockAddr - orc::ExecutorAddr(MachOContainerBlock->getSize()); // Write MachO header and debug section load commands. 
MachOStructWriter Writer(MachOContainerBlock->getAlreadyMutableContent()); @@ -266,9 +268,9 @@ public: memset(&Sec, 0, sizeof(Sec)); memcpy(Sec.sectname, SI.SecName.data(), SI.SecName.size()); memcpy(Sec.segname, SI.SegName.data(), SI.SegName.size()); - Sec.addr = SI.StartAddr; + Sec.addr = SI.StartAddr.getValue(); Sec.size = SI.Size; - Sec.offset = SI.StartAddr; + Sec.offset = SI.StartAddr.getValue(); Sec.align = SI.Alignment; Sec.reloff = 0; Sec.nreloc = 0; @@ -336,7 +338,7 @@ public: memset(&SecCmd, 0, sizeof(SecCmd)); memcpy(SecCmd.sectname, SecName.data(), SecName.size()); memcpy(SecCmd.segname, SegName.data(), SegName.size()); - SecCmd.addr = R.getStart(); + SecCmd.addr = R.getStart().getValue(); SecCmd.size = R.getSize(); SecCmd.offset = 0; SecCmd.align = R.getFirstBlock()->getAlignment(); @@ -347,8 +349,10 @@ public: } SectionRange R(MachOContainerBlock->getSection()); - G.allocActions().push_back( - {{RegisterActionAddr.getValue(), R.getStart(), R.getSize()}, {}}); + G.allocActions().push_back({cantFail(shared::WrapperFunctionCall::Create< + SPSArgList<SPSExecutorAddrRange>>( + RegisterActionAddr, R.getRange())), + {}}); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index eded54f4bfb3..d02760703f06 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -58,7 +58,8 @@ public: auto &DSOHandleSection = G->createSection(".data.__dso_handle", jitlink::MemProt::Read); auto &DSOHandleBlock = G->createContentBlock( - DSOHandleSection, getDSOHandleContent(PointerSize), 0, 8, 0); + DSOHandleSection, getDSOHandleContent(PointerSize), orc::ExecutorAddr(), + 8, 0); auto &DSOHandleSymbol = G->addDefinedSymbol( DSOHandleBlock, 0, *R->getInitializerSymbol(), DSOHandleBlock.getSize(), jitlink::Linkage::Strong, jitlink::Scope::Default, false, true); @@ -154,6 +155,10 @@ Error ELFNixPlatform::setupJITDylib(JITDylib &JD) { std::make_unique<DSOHandleMaterializationUnit>(*this, DSOHandleSymbol)); } +Error ELFNixPlatform::teardownJITDylib(JITDylib &JD) { + return Error::success(); +} + Error ELFNixPlatform::notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) { auto &JD = RT.getJITDylib(); @@ -315,9 +320,14 @@ void ELFNixPlatform::getInitializersLookupPhase( SendInitializerSequenceFn SendResult, JITDylib &JD) { auto DFSLinkOrder = JD.getDFSLinkOrder(); + if (!DFSLinkOrder) { + SendResult(DFSLinkOrder.takeError()); + return; + } + DenseMap<JITDylib *, SymbolLookupSet> NewInitSymbols; ES.runSessionLocked([&]() { - for (auto &InitJD : DFSLinkOrder) { + for (auto &InitJD : *DFSLinkOrder) { auto RISItr = RegisteredInitSymbols.find(InitJD.get()); if (RISItr != RegisteredInitSymbols.end()) { NewInitSymbols[InitJD.get()] = std::move(RISItr->second); @@ -330,7 +340,7 @@ void ELFNixPlatform::getInitializersLookupPhase( // phase. 
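// [Sketch] Several hunks in this commit replace raw {FnAddr, Addr, Size}
// action triples with WrapperFunctionCalls whose arguments are serialized
// up front via SPS. The shape of that pattern, with a toy byte-blob
// serializer standing in for the SPS layer:
#include <cstdint>
#include <cstring>
#include <vector>

struct AddrRange { uint64_t Start, End; };

struct Call {
  uint64_t FnAddr = 0;         // wrapper function address in the executor
  std::vector<char> ArgBytes;  // pre-serialized argument buffer
};

Call makeRangeCall(uint64_t FnAddr, AddrRange R) {
  Call C;
  C.FnAddr = FnAddr;
  C.ArgBytes.resize(sizeof(R));
  std::memcpy(C.ArgBytes.data(), &R, sizeof(R)); // SPS does this portably
  return C;
}

// An alloc action pairs a finalize-time call with its teardown counterpart,
// e.g. register/deregister of an eh-frame or thread-data section range:
struct AllocActionPair { Call Finalize, Dealloc; };

AllocActionPair makeSectionActions(uint64_t RegFn, uint64_t DeregFn,
                                   AddrRange R) {
  return {makeRangeCall(RegFn, R), makeRangeCall(DeregFn, R)};
}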
if (NewInitSymbols.empty()) { getInitializersBuildSequencePhase(std::move(SendResult), JD, - std::move(DFSLinkOrder)); + std::move(*DFSLinkOrder)); return; } @@ -375,7 +385,7 @@ void ELFNixPlatform::rt_getDeinitializers( { std::lock_guard<std::mutex> Lock(PlatformMutex); - auto I = HandleAddrToJITDylib.find(Handle.getValue()); + auto I = HandleAddrToJITDylib.find(Handle); if (I != HandleAddrToJITDylib.end()) JD = I->second; } @@ -406,7 +416,7 @@ void ELFNixPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult, { std::lock_guard<std::mutex> Lock(PlatformMutex); - auto I = HandleAddrToJITDylib.find(Handle.getValue()); + auto I = HandleAddrToJITDylib.find(Handle); if (I != HandleAddrToJITDylib.end()) JD = I->second; } @@ -630,12 +640,11 @@ void ELFNixPlatform::ELFNixPlatformPlugin::addDSOHandleSupportPasses( assert(I != G.defined_symbols().end() && "Missing DSO handle symbol"); { std::lock_guard<std::mutex> Lock(MP.PlatformMutex); - JITTargetAddress HandleAddr = (*I)->getAddress(); + auto HandleAddr = (*I)->getAddress(); MP.HandleAddrToJITDylib[HandleAddr] = &JD; assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists"); MP.InitSeqs.insert(std::make_pair( - &JD, - ELFNixJITDylibInitializers(JD.getName(), ExecutorAddr(HandleAddr)))); + &JD, ELFNixJITDylibInitializers(JD.getName(), HandleAddr))); } return Error::success(); }); diff --git a/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp index 4c0fab8aa9fa..256ce94690f0 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp @@ -56,18 +56,15 @@ EPCEHFrameRegistrar::Create(ExecutionSession &ES) { ExecutorAddr(DeregisterEHFrameWrapperFnAddr)); } -Error EPCEHFrameRegistrar::registerEHFrames(JITTargetAddress EHFrameSectionAddr, - size_t EHFrameSectionSize) { - return ES.callSPSWrapper<void(SPSExecutorAddr, uint64_t)>( - RegisterEHFrameWrapperFnAddr, ExecutorAddr(EHFrameSectionAddr), - static_cast<uint64_t>(EHFrameSectionSize)); +Error EPCEHFrameRegistrar::registerEHFrames(ExecutorAddrRange EHFrameSection) { + return ES.callSPSWrapper<void(SPSExecutorAddrRange)>( + RegisterEHFrameWrapperFnAddr, EHFrameSection); } Error EPCEHFrameRegistrar::deregisterEHFrames( - JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) { - return ES.callSPSWrapper<void(SPSExecutorAddr, uint64_t)>( - DeregisterEHFrameWrapperFnAddr, ExecutorAddr(EHFrameSectionAddr), - static_cast<uint64_t>(EHFrameSectionSize)); + ExecutorAddrRange EHFrameSection) { + return ES.callSPSWrapper<void(SPSExecutorAddrRange)>( + DeregisterEHFrameWrapperFnAddr, EHFrameSection); } } // end namespace orc diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp index 9b712cb8f7ca..75cc30753f41 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp @@ -56,17 +56,7 @@ public: } // Transfer allocation actions. - // FIXME: Merge JITLink and ORC SupportFunctionCall and Action list types, - // turn this into a std::swap. 
- FR.Actions.reserve(G.allocActions().size()); - for (auto &ActPair : G.allocActions()) - FR.Actions.push_back({{ExecutorAddr(ActPair.Finalize.FnAddr), - {ExecutorAddr(ActPair.Finalize.CtxAddr), - ExecutorAddrDiff(ActPair.Finalize.CtxSize)}}, - {ExecutorAddr(ActPair.Dealloc.FnAddr), - {ExecutorAddr(ActPair.Dealloc.CtxAddr), - ExecutorAddrDiff(ActPair.Dealloc.CtxSize)}}}); - G.allocActions().clear(); + std::swap(FR.Actions, G.allocActions()); Parent.EPC.callSPSWrapperAsync< rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>( @@ -80,7 +70,7 @@ public: } else if (FinalizeErr) OnFinalize(std::move(FinalizeErr)); else - OnFinalize(FinalizedAlloc(AllocAddr.getValue())); + OnFinalize(FinalizedAlloc(AllocAddr)); }, Parent.SAs.Allocator, std::move(FR)); } @@ -161,7 +151,7 @@ void EPCGenericJITLinkMemoryManager::completeAllocation( const auto &AG = KV.first; auto &Seg = KV.second; - Seg.Addr = NextSegAddr.getValue(); + Seg.Addr = NextSegAddr; KV.second.WorkingMem = BL.getGraph().allocateBuffer(Seg.ContentSize).data(); NextSegAddr += ExecutorAddrDiff( alignTo(Seg.ContentSize + Seg.ZeroFillSize, EPC.getPageSize())); diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp index 1d98e104a4d7..cdac367e11a3 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp @@ -14,6 +14,8 @@ #define DEBUG_TYPE "orc" +using namespace llvm::orc::shared; + namespace llvm { namespace orc { @@ -27,10 +29,8 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols( {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName}, {SAs.Deallocate, rt::SimpleExecutorMemoryManagerDeallocateWrapperName}, - {SAs.RegisterEHFrame, - rt::RegisterEHFrameSectionCustomDirectWrapperName}, - {SAs.DeregisterEHFrame, - rt::DeregisterEHFrameSectionCustomDirectWrapperName}})) + {SAs.RegisterEHFrame, rt::RegisterEHFrameSectionWrapperName}, + {SAs.DeregisterEHFrame, rt::DeregisterEHFrameSectionWrapperName}})) return std::move(Err); return std::make_unique<EPCGenericRTDyldMemoryManager>(EPC, std::move(SAs)); } @@ -263,10 +263,12 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) { for (auto &Frame : ObjAllocs.UnfinalizedEHFrames) FR.Actions.push_back( - {{SAs.RegisterEHFrame, - {ExecutorAddr(Frame.Addr), ExecutorAddrDiff(Frame.Size)}}, - {SAs.DeregisterEHFrame, - {ExecutorAddr(Frame.Addr), ExecutorAddrDiff(Frame.Size)}}}); + {cantFail( + WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + SAs.RegisterEHFrame, Frame)), + cantFail( + WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + SAs.DeregisterEHFrame, Frame))}); // We'll also need to make an extra allocation for the eh-frame wrapper call // arguments. 
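// [Sketch] CreateWithDefaultBootstrapSymbols above resolves a fixed set of
// named wrapper functions in the executor before the memory manager can be
// built; this commit repoints the eh-frame entries at the plain
// Register/DeregisterEHFrameSection wrappers. A reduced version of that
// bootstrap lookup (hypothetical helper, not the EPC API):
#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

using BootstrapMap = std::map<std::string, uint64_t>;

// Fill each out-param from the map; fail if any required symbol is missing.
std::optional<std::string>
getBootstrapSymbols(const BootstrapMap &M,
                    std::vector<std::pair<uint64_t *, std::string>> Pairs) {
  for (auto &[Out, Name] : Pairs) {
    auto I = M.find(Name);
    if (I == M.end())
      return "missing bootstrap symbol: " + Name;
    *Out = I->second;
  }
  return std::nullopt; // success
}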
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp index 818b6b52ff83..b901a2d2da23 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp @@ -119,10 +119,12 @@ Error EPCTrampolinePool::grow() { unsigned NumTrampolines = TrampolinesPerPage; auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec); - EPCIU.getABISupport().writeTrampolines( - SegInfo.WorkingMem.data(), SegInfo.Addr, ResolverAddress, NumTrampolines); + EPCIU.getABISupport().writeTrampolines(SegInfo.WorkingMem.data(), + SegInfo.Addr.getValue(), + ResolverAddress, NumTrampolines); for (unsigned I = 0; I < NumTrampolines; ++I) - AvailableTrampolines.push_back(SegInfo.Addr + (I * TrampolineSize)); + AvailableTrampolines.push_back(SegInfo.Addr.getValue() + + (I * TrampolineSize)); auto FA = Alloc->finalize(); if (!FA) @@ -300,15 +302,15 @@ EPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr, return Alloc.takeError(); auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec); - ABI->writeResolverCode(SegInfo.WorkingMem.data(), SegInfo.Addr, ReentryFnAddr, - ReentryCtxAddr); + ABI->writeResolverCode(SegInfo.WorkingMem.data(), SegInfo.Addr.getValue(), + ReentryFnAddr, ReentryCtxAddr); auto FA = Alloc->finalize(); if (!FA) return FA.takeError(); ResolverBlock = std::move(*FA); - return SegInfo.Addr; + return SegInfo.Addr.getValue(); } std::unique_ptr<IndirectStubsManager> @@ -369,8 +371,9 @@ EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) { auto StubSeg = Alloc->getSegInfo(StubProt); auto PtrSeg = Alloc->getSegInfo(PtrProt); - ABI->writeIndirectStubsBlock(StubSeg.WorkingMem.data(), StubSeg.Addr, - PtrSeg.Addr, NumStubsToAllocate); + ABI->writeIndirectStubsBlock(StubSeg.WorkingMem.data(), + StubSeg.Addr.getValue(), + PtrSeg.Addr.getValue(), NumStubsToAllocate); auto FA = Alloc->finalize(); if (!FA) @@ -381,8 +384,8 @@ EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) { auto StubExecutorAddr = StubSeg.Addr; auto PtrExecutorAddr = PtrSeg.Addr; for (unsigned I = 0; I != NumStubsToAllocate; ++I) { - AvailableIndirectStubs.push_back( - IndirectStubInfo(StubExecutorAddr, PtrExecutorAddr)); + AvailableIndirectStubs.push_back(IndirectStubInfo( + StubExecutorAddr.getValue(), PtrExecutorAddr.getValue())); StubExecutorAddr += ABI->getStubSize(); PtrExecutorAddr += ABI->getPointerSize(); } diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index f427271bb45d..7a71d2f781d7 100644 --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -410,7 +410,7 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym, while (I < Content.size()) { MCInst Instr; uint64_t InstrSize = 0; - uint64_t InstrStart = SymAddress + I; + uint64_t InstrStart = SymAddress.getValue() + I; auto DecodeStatus = Disassembler.getInstruction( Instr, InstrSize, Content.drop_front(I), InstrStart, CommentStream); if (DecodeStatus != MCDisassembler::Success) { @@ -426,7 +426,7 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym, // Check for a PC-relative address equal to the symbol itself. 
auto PCRelAddr = MIA.evaluateMemoryOperandAddress(Instr, &STI, InstrStart, InstrSize); - if (!PCRelAddr.hasValue() || PCRelAddr.getValue() != SymAddress) + if (!PCRelAddr || *PCRelAddr != SymAddress.getValue()) continue; auto RelocOffInInstr = @@ -438,8 +438,8 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym, continue; } - auto RelocOffInBlock = - InstrStart + *RelocOffInInstr - SymAddress + Sym.getOffset(); + auto RelocOffInBlock = orc::ExecutorAddr(InstrStart) + *RelocOffInInstr - + SymAddress + Sym.getOffset(); if (ExistingRelocations.contains(RelocOffInBlock)) continue; diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 0ab0d7d2e2b6..91949c9d7eeb 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -89,6 +89,7 @@ class GenericLLVMIRPlatform : public Platform { public: GenericLLVMIRPlatform(GenericLLVMIRPlatformSupport &S) : S(S) {} Error setupJITDylib(JITDylib &JD) override; + Error teardownJITDylib(JITDylib &JD) override; Error notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) override; Error notifyRemoving(ResourceTracker &RT) override { @@ -276,17 +277,22 @@ private: DenseMap<JITDylib *, SymbolLookupSet> LookupSymbols; std::vector<JITDylibSP> DFSLinkOrder; - getExecutionSession().runSessionLocked([&]() { - DFSLinkOrder = JD.getDFSLinkOrder(); - - for (auto &NextJD : DFSLinkOrder) { - auto IFItr = InitFunctions.find(NextJD.get()); - if (IFItr != InitFunctions.end()) { - LookupSymbols[NextJD.get()] = std::move(IFItr->second); - InitFunctions.erase(IFItr); - } - } - }); + if (auto Err = getExecutionSession().runSessionLocked([&]() -> Error { + if (auto DFSLinkOrderOrErr = JD.getDFSLinkOrder()) + DFSLinkOrder = std::move(*DFSLinkOrderOrErr); + else + return DFSLinkOrderOrErr.takeError(); + + for (auto &NextJD : DFSLinkOrder) { + auto IFItr = InitFunctions.find(NextJD.get()); + if (IFItr != InitFunctions.end()) { + LookupSymbols[NextJD.get()] = std::move(IFItr->second); + InitFunctions.erase(IFItr); + } + } + return Error::success(); + })) + return std::move(Err); LLVM_DEBUG({ dbgs() << "JITDylib init order is [ "; @@ -326,20 +332,25 @@ private: DenseMap<JITDylib *, SymbolLookupSet> LookupSymbols; std::vector<JITDylibSP> DFSLinkOrder; - ES.runSessionLocked([&]() { - DFSLinkOrder = JD.getDFSLinkOrder(); - - for (auto &NextJD : DFSLinkOrder) { - auto &JDLookupSymbols = LookupSymbols[NextJD.get()]; - auto DIFItr = DeInitFunctions.find(NextJD.get()); - if (DIFItr != DeInitFunctions.end()) { - LookupSymbols[NextJD.get()] = std::move(DIFItr->second); - DeInitFunctions.erase(DIFItr); - } - JDLookupSymbols.add(LLJITRunAtExits, - SymbolLookupFlags::WeaklyReferencedSymbol); - } - }); + if (auto Err = ES.runSessionLocked([&]() -> Error { + if (auto DFSLinkOrderOrErr = JD.getDFSLinkOrder()) + DFSLinkOrder = std::move(*DFSLinkOrderOrErr); + else + return DFSLinkOrderOrErr.takeError(); + + for (auto &NextJD : DFSLinkOrder) { + auto &JDLookupSymbols = LookupSymbols[NextJD.get()]; + auto DIFItr = DeInitFunctions.find(NextJD.get()); + if (DIFItr != DeInitFunctions.end()) { + LookupSymbols[NextJD.get()] = std::move(DIFItr->second); + DeInitFunctions.erase(DIFItr); + } + JDLookupSymbols.add(LLJITRunAtExits, + SymbolLookupFlags::WeaklyReferencedSymbol); + } + return Error::success(); + })) + return std::move(Err); LLVM_DEBUG({ dbgs() << "JITDylib deinit order is [ "; @@ -380,17 +391,22 @@ private: DenseMap<JITDylib *, SymbolLookupSet> RequiredInitSymbols; 
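// [Sketch] The LLJIT hunks above change the runSessionLocked callbacks from
// void lambdas into Error-returning ones so a failed getDFSLinkOrder can
// propagate out of the locked region. The enabling helper is essentially
// "run under the session lock and forward whatever the callable returns":
#include <mutex>
#include <utility>

template <typename Fn>
auto runLocked(std::mutex &M, Fn &&F) -> decltype(F()) {
  std::lock_guard<std::mutex> Lock(M);
  return std::forward<Fn>(F)(); // works for void, value, or Error-like returns
}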
std::vector<JITDylibSP> DFSLinkOrder; - getExecutionSession().runSessionLocked([&]() { - DFSLinkOrder = JD.getDFSLinkOrder(); - - for (auto &NextJD : DFSLinkOrder) { - auto ISItr = InitSymbols.find(NextJD.get()); - if (ISItr != InitSymbols.end()) { - RequiredInitSymbols[NextJD.get()] = std::move(ISItr->second); - InitSymbols.erase(ISItr); - } - } - }); + if (auto Err = getExecutionSession().runSessionLocked([&]() -> Error { + if (auto DFSLinkOrderOrErr = JD.getDFSLinkOrder()) + DFSLinkOrder = std::move(*DFSLinkOrderOrErr); + else + return DFSLinkOrderOrErr.takeError(); + + for (auto &NextJD : DFSLinkOrder) { + auto ISItr = InitSymbols.find(NextJD.get()); + if (ISItr != InitSymbols.end()) { + RequiredInitSymbols[NextJD.get()] = std::move(ISItr->second); + InitSymbols.erase(ISItr); + } + } + return Error::success(); + })) + return Err; return Platform::lookupInitSymbols(getExecutionSession(), RequiredInitSymbols) @@ -460,6 +476,10 @@ Error GenericLLVMIRPlatform::setupJITDylib(JITDylib &JD) { return S.setupJITDylib(JD); } +Error GenericLLVMIRPlatform::teardownJITDylib(JITDylib &JD) { + return Error::success(); +} + Error GenericLLVMIRPlatform::notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) { return S.notifyAdding(RT, MU); diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index fb2e90e1c9c5..a364719855b4 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -106,7 +106,8 @@ private: auto HeaderContent = G.allocateString( StringRef(reinterpret_cast<const char *>(&Hdr), sizeof(Hdr))); - return G.createContentBlock(HeaderSection, HeaderContent, 0, 8, 0); + return G.createContentBlock(HeaderSection, HeaderContent, ExecutorAddr(), 8, + 0); } static MaterializationUnit::Interface @@ -202,6 +203,8 @@ Error MachOPlatform::setupJITDylib(JITDylib &JD) { *this, MachOHeaderStartSymbol)); } +Error MachOPlatform::teardownJITDylib(JITDylib &JD) { return Error::success(); } + Error MachOPlatform::notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) { auto &JD = RT.getJITDylib(); @@ -379,9 +382,14 @@ void MachOPlatform::getInitializersLookupPhase( SendInitializerSequenceFn SendResult, JITDylib &JD) { auto DFSLinkOrder = JD.getDFSLinkOrder(); + if (!DFSLinkOrder) { + SendResult(DFSLinkOrder.takeError()); + return; + } + DenseMap<JITDylib *, SymbolLookupSet> NewInitSymbols; ES.runSessionLocked([&]() { - for (auto &InitJD : DFSLinkOrder) { + for (auto &InitJD : *DFSLinkOrder) { auto RISItr = RegisteredInitSymbols.find(InitJD.get()); if (RISItr != RegisteredInitSymbols.end()) { NewInitSymbols[InitJD.get()] = std::move(RISItr->second); @@ -394,7 +402,7 @@ void MachOPlatform::getInitializersLookupPhase( // phase. 
if (NewInitSymbols.empty()) { getInitializersBuildSequencePhase(std::move(SendResult), JD, - std::move(DFSLinkOrder)); + std::move(*DFSLinkOrder)); return; } @@ -439,7 +447,7 @@ void MachOPlatform::rt_getDeinitializers(SendDeinitializerSequenceFn SendResult, { std::lock_guard<std::mutex> Lock(PlatformMutex); - auto I = HeaderAddrToJITDylib.find(Handle.getValue()); + auto I = HeaderAddrToJITDylib.find(Handle); if (I != HeaderAddrToJITDylib.end()) JD = I->second; } @@ -469,7 +477,7 @@ void MachOPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult, { std::lock_guard<std::mutex> Lock(PlatformMutex); - auto I = HeaderAddrToJITDylib.find(Handle.getValue()); + auto I = HeaderAddrToJITDylib.find(Handle); if (I != HeaderAddrToJITDylib.end()) JD = I->second; } @@ -661,11 +669,11 @@ Error MachOPlatform::MachOPlatformPlugin::associateJITDylibHeaderSymbol( auto &JD = MR.getTargetJITDylib(); std::lock_guard<std::mutex> Lock(MP.PlatformMutex); - JITTargetAddress HeaderAddr = (*I)->getAddress(); + auto HeaderAddr = (*I)->getAddress(); MP.HeaderAddrToJITDylib[HeaderAddr] = &JD; assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists"); - MP.InitSeqs.insert(std::make_pair( - &JD, MachOJITDylibInitializers(JD.getName(), ExecutorAddr(HeaderAddr)))); + MP.InitSeqs.insert( + std::make_pair(&JD, MachOJITDylibInitializers(JD.getName(), HeaderAddr))); return Error::success(); } @@ -792,7 +800,7 @@ Error MachOPlatform::MachOPlatformPlugin::registerInitSections( if (auto *ObjCImageInfoSec = G.findSectionByName(ObjCImageInfoSectionName)) { if (auto Addr = jitlink::SectionRange(*ObjCImageInfoSec).getStart()) - ObjCImageInfoAddr.setValue(Addr); + ObjCImageInfoAddr = Addr; } for (auto InitSectionName : InitSectionNames) @@ -880,10 +888,12 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections( jitlink::SectionRange R(*EHFrameSection); if (!R.empty()) G.allocActions().push_back( - {{MP.orc_rt_macho_register_ehframe_section.getValue(), R.getStart(), - R.getSize()}, - {MP.orc_rt_macho_deregister_ehframe_section.getValue(), R.getStart(), - R.getSize()}}); + {cantFail( + WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + MP.orc_rt_macho_register_ehframe_section, R.getRange())), + cantFail( + WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + MP.orc_rt_macho_deregister_ehframe_section, R.getRange()))}); } // Get a pointer to the thread data section if there is one. It will be used @@ -913,10 +923,13 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections( inconvertibleErrorCode()); G.allocActions().push_back( - {{MP.orc_rt_macho_register_thread_data_section.getValue(), - R.getStart(), R.getSize()}, - {MP.orc_rt_macho_deregister_thread_data_section.getValue(), - R.getStart(), R.getSize()}}); + {cantFail( + WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + MP.orc_rt_macho_register_thread_data_section, R.getRange())), + cantFail( + WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + MP.orc_rt_macho_deregister_thread_data_section, + R.getRange()))}); } } return Error::success(); @@ -963,10 +976,10 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHSectionsPhase1( // Otherwise, add allocation actions to the graph to register eh-frames for // this object. 
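The allocation-action hunks in this file replace raw {function, address, size} triples with pairs of pre-serialized WrapperFunctionCalls. A sketch of building one such finalize/dealloc pair, assuming RegisterFn and DeregisterFn are executor-side entry-point addresses already known to the caller:

  #include "llvm/ExecutionEngine/Orc/Shared/AllocationActions.h"

  using namespace llvm;
  using namespace llvm::orc;
  using namespace llvm::orc::shared;

  // Sketch: one finalize/dealloc action pair carrying an address range,
  // serialized via SPS. RegisterFn/DeregisterFn are assumed executor-side
  // functions taking an SPSExecutorAddrRange.
  static void addSectionActions(AllocActions &AAs, ExecutorAddr RegisterFn,
                                ExecutorAddr DeregisterFn,
                                ExecutorAddrRange R) {
    AAs.push_back(
        {cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
             RegisterFn, R)),
         cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
             DeregisterFn, R))});
  }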
G.allocActions().push_back( - {{orc_rt_macho_register_ehframe_section.getValue(), R.getStart(), - R.getSize()}, - {orc_rt_macho_deregister_ehframe_section.getValue(), R.getStart(), - R.getSize()}}); + {cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + orc_rt_macho_register_ehframe_section, R.getRange())), + cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>( + orc_rt_macho_deregister_ehframe_section, R.getRange()))}); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 0d6a33c5685e..32c5998a789b 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -217,7 +217,7 @@ public: Flags |= JITSymbolFlags::Exported; InternedResult[InternedName] = - JITEvaluatedSymbol(Sym->getAddress(), Flags); + JITEvaluatedSymbol(Sym->getAddress().getValue(), Flags); if (AutoClaim && !MR->getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -235,7 +235,7 @@ public: if (Sym->getLinkage() == Linkage::Weak) Flags |= JITSymbolFlags::Weak; InternedResult[InternedName] = - JITEvaluatedSymbol(Sym->getAddress(), Flags); + JITEvaluatedSymbol(Sym->getAddress().getValue(), Flags); if (AutoClaim && !MR->getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -743,7 +743,7 @@ void EHFrameRegistrationPlugin::modifyPassConfig( PassConfiguration &PassConfig) { PassConfig.PostFixupPasses.push_back(createEHFrameRecorderPass( - G.getTargetTriple(), [this, &MR](JITTargetAddress Addr, size_t Size) { + G.getTargetTriple(), [this, &MR](ExecutorAddr Addr, size_t Size) { if (Addr) { std::lock_guard<std::mutex> Lock(EHFramePluginMutex); assert(!InProcessLinks.count(&MR) && @@ -756,7 +756,7 @@ void EHFrameRegistrationPlugin::modifyPassConfig( Error EHFrameRegistrationPlugin::notifyEmitted( MaterializationResponsibility &MR) { - EHFrameRange EmittedRange; + ExecutorAddrRange EmittedRange; { std::lock_guard<std::mutex> Lock(EHFramePluginMutex); @@ -765,7 +765,7 @@ Error EHFrameRegistrationPlugin::notifyEmitted( return Error::success(); EmittedRange = EHFrameRangeItr->second; - assert(EmittedRange.Addr && "eh-frame addr to register can not be null"); + assert(EmittedRange.Start && "eh-frame addr to register can not be null"); InProcessLinks.erase(EHFrameRangeItr); } @@ -773,7 +773,7 @@ Error EHFrameRegistrationPlugin::notifyEmitted( [&](ResourceKey K) { EHFrameRanges[K].push_back(EmittedRange); })) return Err; - return Registrar->registerEHFrames(EmittedRange.Addr, EmittedRange.Size); + return Registrar->registerEHFrames(EmittedRange); } Error EHFrameRegistrationPlugin::notifyFailed( @@ -784,7 +784,7 @@ Error EHFrameRegistrationPlugin::notifyFailed( } Error EHFrameRegistrationPlugin::notifyRemovingResources(ResourceKey K) { - std::vector<EHFrameRange> RangesToRemove; + std::vector<ExecutorAddrRange> RangesToRemove; ES.runSessionLocked([&] { auto I = EHFrameRanges.find(K); @@ -798,10 +798,9 @@ Error EHFrameRegistrationPlugin::notifyRemovingResources(ResourceKey K) { while (!RangesToRemove.empty()) { auto RangeToRemove = RangesToRemove.back(); RangesToRemove.pop_back(); - assert(RangeToRemove.Addr && "Untracked eh-frame range must not be null"); - Err = joinErrors( - std::move(Err), - Registrar->deregisterEHFrames(RangeToRemove.Addr, RangeToRemove.Size)); + assert(RangeToRemove.Start && "Untracked eh-frame range must not be null");
+ Err = joinErrors(std::move(Err), + Registrar->deregisterEHFrames(RangeToRemove)); } return Err; diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index 77a8f5af8ba0..71be8dfdc004 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -611,7 +611,7 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess( DynamicLibrarySearchGenerator::GetForCurrentProcess(GlobalPrefix, Pred); if (!ProcessSymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(ProcessSymsGenerator.takeError()); } @@ -637,7 +637,7 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForPath( DynamicLibrarySearchGenerator::Load(FileName, GlobalPrefix, Pred); if (!LibrarySymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(LibrarySymsGenerator.takeError()); } @@ -657,7 +657,7 @@ LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath( auto LibrarySymsGenerator = StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName, TT); if (!LibrarySymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(LibrarySymsGenerator.takeError()); } *Result = wrap(LibrarySymsGenerator->release()); @@ -666,7 +666,7 @@ LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath( auto LibrarySymsGenerator = StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName); if (!LibrarySymsGenerator) { - *Result = 0; + *Result = nullptr; return wrap(LibrarySymsGenerator.takeError()); } *Result = wrap(LibrarySymsGenerator->release()); @@ -712,7 +712,7 @@ LLVMErrorRef LLVMOrcJITTargetMachineBuilderDetectHost( auto JTMB = JITTargetMachineBuilder::detectHost(); if (!JTMB) { - Result = 0; + Result = nullptr; return wrap(JTMB.takeError()); } @@ -876,7 +876,7 @@ LLVMErrorRef LLVMOrcCreateLLJIT(LLVMOrcLLJITRef *Result, LLVMOrcDisposeLLJITBuilder(Builder); if (!J) { - Result = 0; + Result = nullptr; return wrap(J.takeError()); } diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp new file mode 100644 index 000000000000..91f2899449ef --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp @@ -0,0 +1,44 @@ +//===----- AllocationActions.cpp -- JITLink allocation support calls -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/Shared/AllocationActions.h" + +namespace llvm { +namespace orc { +namespace shared { + +Expected<std::vector<WrapperFunctionCall>> +runFinalizeActions(AllocActions &AAs) { + std::vector<WrapperFunctionCall> DeallocActions; + DeallocActions.reserve(numDeallocActions(AAs)); + + for (auto &AA : AAs) { + if (AA.Finalize) + if (auto Err = AA.Finalize.runWithSPSRetErrorMerged()) + return joinErrors(std::move(Err), runDeallocActions(DeallocActions)); + + if (AA.Dealloc) + DeallocActions.push_back(std::move(AA.Dealloc)); + } + + AAs.clear(); + return DeallocActions; +} + +Error runDeallocActions(ArrayRef<WrapperFunctionCall> DAs) { + Error Err = Error::success(); + while (!DAs.empty()) { + Err = joinErrors(std::move(Err), DAs.back().runWithSPSRetErrorMerged()); + DAs = DAs.drop_back(); + } + return Err; +} + +} // namespace shared +} // namespace orc +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp index 02044e4af29a..5eae33121eb9 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp @@ -36,10 +36,10 @@ const char *MemoryWriteUInt64sWrapperName = "__llvm_orc_bootstrap_mem_write_uint64s_wrapper"; const char *MemoryWriteBuffersWrapperName = "__llvm_orc_bootstrap_mem_write_buffers_wrapper"; -const char *RegisterEHFrameSectionCustomDirectWrapperName = - "__llvm_orc_bootstrap_register_ehframe_section_custom_direct_wrapper"; -const char *DeregisterEHFrameSectionCustomDirectWrapperName = - "__llvm_orc_bootstrap_deregister_ehframe_section_custom_direct_wrapper"; +const char *RegisterEHFrameSectionWrapperName = + "__llvm_orc_bootstrap_register_ehframe_section_wrapper"; +const char *DeregisterEHFrameSectionWrapperName = + "__llvm_orc_bootstrap_deregister_ehframe_section_wrapper"; const char *RunAsMainWrapperName = "__llvm_orc_bootstrap_run_as_main_wrapper"; } // end namespace rt diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp index 4c15e25b1d89..ffa2969536e7 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp @@ -105,23 +105,25 @@ static void registerJITLoaderGDBImpl(const char *ObjAddr, size_t Size) { extern "C" orc::shared::CWrapperFunctionResult llvm_orc_registerJITLoaderGDBAllocAction(const char *Data, size_t Size) { using namespace orc::shared; - return WrapperFunction<SPSError()>::handle(nullptr, 0, - [=]() -> Error { - registerJITLoaderGDBImpl(Data, - Size); - return Error::success(); - }) + return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle( + Data, Size, + [](ExecutorAddrRange R) { + registerJITLoaderGDBImpl(R.Start.toPtr<const char *>(), + R.size()); + return Error::success(); + }) .release(); } extern "C" orc::shared::CWrapperFunctionResult llvm_orc_registerJITLoaderGDBWrapper(const char *Data, uint64_t Size) { using namespace orc::shared; - return WrapperFunction<void(SPSExecutorAddrRange)>::handle( + return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle( Data, Size, [](ExecutorAddrRange R) { - registerJITLoaderGDBImpl(R.Start.toPtr<char *>(), - R.size().getValue()); + registerJITLoaderGDBImpl(R.Start.toPtr<const char *>(), + R.size()); + return Error::success(); }) 
.release(); } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp index 82aa62a0c0d9..909d47deef59 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp @@ -71,11 +71,10 @@ void addTo(StringMap<ExecutorAddr> &M) { shared::SPSMemoryAccessUInt64Write>); M[rt::MemoryWriteBuffersWrapperName] = ExecutorAddr::fromPtr(&writeBuffersWrapper); - M[rt::RegisterEHFrameSectionCustomDirectWrapperName] = ExecutorAddr::fromPtr( - &llvm_orc_registerEHFrameSectionCustomDirectWrapper); - M[rt::DeregisterEHFrameSectionCustomDirectWrapperName] = - ExecutorAddr::fromPtr( - &llvm_orc_deregisterEHFrameSectionCustomDirectWrapper); + M[rt::RegisterEHFrameSectionWrapperName] = + ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper); + M[rt::DeregisterEHFrameSectionWrapperName] = + ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper); M[rt::RunAsMainWrapperName] = ExecutorAddr::fromPtr(&runAsMainWrapper); } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h index 6b7ff79a3efc..92b513d0bb53 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h @@ -33,4 +33,4 @@ void addTo(StringMap<ExecutorAddr> &M); } // end namespace orc } // end namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H +#endif // LIB_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp index e331bad84200..fdae0e45da65 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp @@ -158,42 +158,26 @@ Error deregisterEHFrameSection(const void *EHFrameSectionAddr, } // end namespace orc } // end namespace llvm -extern "C" llvm::orc::shared::CWrapperFunctionResult -llvm_orc_registerEHFrameSectionCustomDirectWrapper( - const char *EHFrameSectionAddr, uint64_t Size) { - if (auto Err = registerEHFrameSection(EHFrameSectionAddr, Size)) - return WrapperFunctionResult::createOutOfBandError(toString(std::move(Err))) - .release(); - return llvm::orc::shared::CWrapperFunctionResult(); +static Error registerEHFrameWrapper(ExecutorAddrRange EHFrame) { + return llvm::orc::registerEHFrameSection(EHFrame.Start.toPtr<const void *>(), + EHFrame.size()); } -extern "C" llvm::orc::shared::CWrapperFunctionResult -llvm_orc_deregisterEHFrameSectionCustomDirectWrapper( - const char *EHFrameSectionAddr, uint64_t Size) { - if (auto Err = deregisterEHFrameSection(EHFrameSectionAddr, Size)) - return WrapperFunctionResult::createOutOfBandError(toString(std::move(Err))) - .release(); - return llvm::orc::shared::CWrapperFunctionResult(); -} - -static Error registerEHFrameWrapper(ExecutorAddr Addr, uint64_t Size) { - return llvm::orc::registerEHFrameSection(Addr.toPtr<const void *>(), Size); -} - -static Error deregisterEHFrameWrapper(ExecutorAddr Addr, uint64_t Size) { - return llvm::orc::deregisterEHFrameSection(Addr.toPtr<const void *>(), Size); +static Error deregisterEHFrameWrapper(ExecutorAddrRange EHFrame) { + return llvm::orc::deregisterEHFrameSection( + EHFrame.Start.toPtr<const void *>(), EHFrame.size()); } extern "C" orc::shared::CWrapperFunctionResult 
llvm_orc_registerEHFrameSectionWrapper(const char *Data, uint64_t Size) { - return WrapperFunction<SPSError(SPSExecutorAddr, uint64_t)>::handle( + return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle( Data, Size, registerEHFrameWrapper) .release(); } extern "C" orc::shared::CWrapperFunctionResult llvm_orc_deregisterEHFrameSectionWrapper(const char *Data, uint64_t Size) { - return WrapperFunction<SPSError(SPSExecutorAddr, uint64_t)>::handle( + return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle( Data, Size, deregisterEHFrameWrapper) .release(); } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp index 232340c22a32..7cadf3bb51a7 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp @@ -24,7 +24,7 @@ SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() { Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) { std::error_code EC; auto MB = sys::Memory::allocateMappedMemory( - Size, 0, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); + Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); if (EC) return errorCodeToError(EC); std::lock_guard<std::mutex> Lock(M); @@ -35,7 +35,7 @@ Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) { Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) { ExecutorAddr Base(~0ULL); - std::vector<tpctypes::WrapperFunctionCall> DeallocationActions; + std::vector<shared::WrapperFunctionCall> DeallocationActions; size_t SuccessfulFinalizationActions = 0; if (FR.Segments.empty()) { @@ -52,8 +52,8 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) { Base = std::min(Base, Seg.Addr); for (auto &ActPair : FR.Actions) - if (ActPair.Deallocate.Func) - DeallocationActions.push_back(ActPair.Deallocate); + if (ActPair.Dealloc) + DeallocationActions.push_back(ActPair.Dealloc); // Get the Allocation for this finalization. size_t AllocSize = 0; @@ -96,7 +96,7 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) { while (SuccessfulFinalizationActions) Err = joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions] - .Deallocate.runWithSPSRet()); + .Dealloc.runWithSPSRetErrorMerged()); // Deallocate memory. sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size); @@ -139,7 +139,7 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) { // Run finalization actions. 
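The loop that follows runs each Finalize call through the renamed runWithSPSRetErrorMerged; when one fails, the actions that already completed are unwound. A sketch of that reverse-order rollback idiom, mirroring runDeallocActions from the new AllocationActions.cpp above:

  #include "llvm/ExecutionEngine/Orc/Shared/AllocationActions.h"
  #include <vector>

  using namespace llvm;
  using namespace llvm::orc::shared;

  // Sketch: run dealloc actions latest-first and merge all failures into
  // one Error so no action is skipped on the way down.
  static Error runRollback(std::vector<WrapperFunctionCall> DeallocActions) {
    Error Err = Error::success();
    while (!DeallocActions.empty()) {
      Err = joinErrors(std::move(Err),
                       DeallocActions.back().runWithSPSRetErrorMerged());
      DeallocActions.pop_back();
    }
    return Err;
  }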
for (auto &ActPair : FR.Actions) { - if (auto Err = ActPair.Finalize.runWithSPSRet()) + if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged()) return BailOut(std::move(Err)); ++SuccessfulFinalizationActions; } @@ -212,7 +212,7 @@ Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) { while (!A.DeallocationActions.empty()) { Err = joinErrors(std::move(Err), - A.DeallocationActions.back().runWithSPSRet()); + A.DeallocationActions.back().runWithSPSRetErrorMerged()); A.DeallocationActions.pop_back(); } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index 2b88c481dab0..33db23408cf2 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -97,8 +97,8 @@ private: class EvalResult { public: - EvalResult() : Value(0), ErrorMsg("") {} - EvalResult(uint64_t Value) : Value(Value), ErrorMsg("") {} + EvalResult() : Value(0) {} + EvalResult(uint64_t Value) : Value(Value) {} EvalResult(std::string ErrorMsg) : Value(0), ErrorMsg(std::move(ErrorMsg)) {} uint64_t getValue() const { return Value; } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 0de76ab78e0f..f92618afdff6 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -422,6 +422,8 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, default: report_fatal_error("Relocation type not implemented yet!"); break; + case ELF::R_AARCH64_NONE: + break; case ELF::R_AARCH64_ABS16: { uint64_t Result = Value + Addend; assert(static_cast<int64_t>(Result) >= INT16_MIN && Result < UINT16_MAX); diff --git a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp index 6690dd07d99b..56b232b9dbcd 100644 --- a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp @@ -114,11 +114,11 @@ uint8_t *SectionMemoryManager::allocateSection( // Copy the address to all the other groups, if they have not // been initialized. - if (CodeMem.Near.base() == 0) + if (CodeMem.Near.base() == nullptr) CodeMem.Near = MB; - if (RODataMem.Near.base() == 0) + if (RODataMem.Near.base() == nullptr) RODataMem.Near = MB; - if (RWDataMem.Near.base() == 0) + if (RWDataMem.Near.base() == nullptr) RWDataMem.Near = MB; // Remember that we allocated this memory diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index c962231cbdc1..6186af444e73 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1007,8 +1007,9 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix, // brackets. They also accept a combined form which sets a numeric variable // to the evaluation of an expression. Both string and numeric variable // names must satisfy the regular expression "[a-zA-Z_][0-9a-zA-Z_]*" to be - // valid, as this helps catch some common errors. - if (PatternStr.startswith("[[")) { + // valid, as this helps catch some common errors. If there are extra '['s + // before the "[[", treat them literally. + if (PatternStr.startswith("[[") && !PatternStr.startswith("[[[")) { StringRef UnparsedPatternStr = PatternStr.substr(2); // Find the closing bracket pair ending the match. End is going to be an // offset relative to the beginning of the match string. 
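The FileCheck hunk above makes a leading literal '[' (as in "[[[") opt out of substitution parsing, and the hunk below makes the fixed-string scan start at offset 1. A small sketch of that search with the rationale as comments; the helper name is illustrative:

  #include <algorithm>
  #include <cstddef>
  #include <string>

  // Sketch: searching from offset 1 rather than 0 means a pattern such as
  // "[[[var]]" consumes its first '[' as literal text; the remaining
  // "[[var]]" is then parsed as a substitution on the next iteration, so
  // the parser always makes forward progress.
  static size_t fixedMatchEnd(const std::string &PatternStr) {
    return std::min(PatternStr.find("{{", 1), PatternStr.find("[[", 1));
  }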
@@ -1183,12 +1184,14 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix, Substitutions.push_back(Substitution); } } + + continue; } // Handle fixed string matches. // Find the end, which is the start of the next regex. - size_t FixedMatchEnd = PatternStr.find("{{"); - FixedMatchEnd = std::min(FixedMatchEnd, PatternStr.find("[[")); + size_t FixedMatchEnd = + std::min(PatternStr.find("{{", 1), PatternStr.find("[[", 1)); RegExStr += Regex::escape(PatternStr.substr(0, FixedMatchEnd)); PatternStr = PatternStr.substr(FixedMatchEnd); } @@ -2215,7 +2218,7 @@ static Error reportMatchResult(bool ExpectedMatch, const SourceMgr &SM, static unsigned CountNumNewlinesBetween(StringRef Range, const char *&FirstNewLine) { unsigned NumNewLines = 0; - while (1) { + while (true) { // Scan for newline. Range = Range.substr(Range.find_first_of("\n\r")); if (Range.empty()) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5157d51fd18c..3b8d80c4eeec 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/AssumptionCache.h" @@ -21,7 +22,9 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" @@ -37,6 +40,7 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cstdint> #include <sstream> #define DEBUG_TYPE "openmp-ir-builder" @@ -56,6 +60,20 @@ static cl::opt<double> UnrollThresholdFactor( "simplifications still taking place"), cl::init(1.5)); +#ifndef NDEBUG +/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions +/// at position IP1 may change the meaning of IP2 or vice-versa. This is because +/// an InsertPoint stores the instruction before something is inserted. For +/// instance, if both point to the same instruction, two IRBuilders alternating +/// creating instruction will cause the instructions to be interleaved. 
+static bool isConflictIP(IRBuilder<>::InsertPoint IP1, + IRBuilder<>::InsertPoint IP2) { + if (!IP1.isSet() || !IP2.isSet()) + return false; + return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint(); +} +#endif + void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { LLVMContext &Ctx = Fn.getContext(); @@ -156,7 +174,7 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) { void OpenMPIRBuilder::initialize() { initializeTypes(M); } -void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) { +void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; SmallVector<BasicBlock *, 32> Blocks; SmallVector<OutlineInfo, 16> DeferredOutlines; @@ -175,7 +193,7 @@ void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) { Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, - /* AggregateArgs */ false, + /* AggregateArgs */ true, /* BlockFrequencyInfo */ nullptr, /* BranchProbabilityInfo */ nullptr, /* AssumptionCache */ nullptr, @@ -189,6 +207,9 @@ void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) { assert(Extractor.isEligible() && "Expected OpenMP outlining to be possible!"); + for (auto *V : OI.ExcludeArgsFromAggregate) + Extractor.excludeArgFromAggregate(V); + Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n"); @@ -207,25 +228,25 @@ void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) { BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock(); assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB); assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry); - if (AllowExtractorSinking) { - // Move instructions from the to-be-deleted ArtificialEntry to the entry - // basic block of the parallel region. CodeExtractor may have sunk - // allocas/bitcasts for values that are solely used in the outlined - // region and do not escape. - assert(!ArtificialEntry.empty() && - "Expected instructions to sink in the outlined region"); - for (BasicBlock::iterator It = ArtificialEntry.begin(), - End = ArtificialEntry.end(); - It != End;) { - Instruction &I = *It; - It++; - - if (I.isTerminator()) - continue; - - I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); - } + // Move instructions from the to-be-deleted ArtificialEntry to the entry + // basic block of the parallel region. CodeExtractor generates + // instructions to unwrap the aggregate argument and may sink + // allocas/bitcasts for values that are solely used in the outlined region + // and do not escape. 
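These finalize() hunks switch OpenMP outlining to aggregate-argument mode, with selected values kept out of the aggregate. A reduced sketch of the CodeExtractor usage; Blocks, TIDAddr, and CEAC are assumed to be in scope, and the extra constructor arguments shown in the patch are left at their defaults:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/Transforms/Utils/CodeExtractor.h"

  using namespace llvm;

  // Sketch: captured values travel in one aggregate argument; values passed
  // to excludeArgFromAggregate remain separate scalar arguments.
  static Function *outlineRegion(ArrayRef<BasicBlock *> Blocks, Value *TIDAddr,
                                 const CodeExtractorAnalysisCache &CEAC) {
    CodeExtractor Extractor(Blocks, /*DominatorTree=*/nullptr,
                            /*AggregateArgs=*/true);
    Extractor.excludeArgFromAggregate(TIDAddr);
    return Extractor.extractCodeRegion(CEAC);
  }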
+ assert(!ArtificialEntry.empty() && + "Expected instructions to add in the outlined region entry"); + for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(), + End = ArtificialEntry.rend(); + It != End;) { + Instruction &I = *It; + It++; + + if (I.isTerminator()) + continue; + + I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); } + OI.EntryBB->moveBefore(&ArtificialEntry); ArtificialEntry.eraseFromParent(); } @@ -251,23 +272,26 @@ GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) { new GlobalVariable(M, I32Ty, /* isConstant = */ true, GlobalValue::WeakODRLinkage, ConstantInt::get(I32Ty, Value), Name); + GV->setVisibility(GlobalValue::HiddenVisibility); return GV; } -Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, - IdentFlag LocFlags, - unsigned Reserve2Flags) { +Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, + uint32_t SrcLocStrSize, + IdentFlag LocFlags, + unsigned Reserve2Flags) { // Enable "C-mode". LocFlags |= OMP_IDENT_FLAG_KMPC; - Value *&Ident = + Constant *&Ident = IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}]; if (!Ident) { Constant *I32Null = ConstantInt::getNullValue(Int32); - Constant *IdentData[] = { - I32Null, ConstantInt::get(Int32, uint32_t(LocFlags)), - ConstantInt::get(Int32, Reserve2Flags), I32Null, SrcLocStr}; + Constant *IdentData[] = {I32Null, + ConstantInt::get(Int32, uint32_t(LocFlags)), + ConstantInt::get(Int32, Reserve2Flags), + ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr}; Constant *Initializer = ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData); @@ -290,10 +314,12 @@ Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, } } - return Builder.CreatePointerCast(Ident, IdentPtr); + return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr); } -Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) { +Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr, + uint32_t &SrcLocStrSize) { + SrcLocStrSize = LocStr.size(); Constant *&SrcLocStr = SrcLocStrMap[LocStr]; if (!SrcLocStr) { Constant *Initializer = @@ -314,8 +340,8 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) { Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName, - unsigned Line, - unsigned Column) { + unsigned Line, unsigned Column, + uint32_t &SrcLocStrSize) { SmallString<128> Buffer; Buffer.push_back(';'); Buffer.append(FileName); @@ -327,17 +353,21 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName, Buffer.append(std::to_string(Column)); Buffer.push_back(';'); Buffer.push_back(';'); - return getOrCreateSrcLocStr(Buffer.str()); + return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize); } -Constant *OpenMPIRBuilder::getOrCreateDefaultSrcLocStr() { - return getOrCreateSrcLocStr(";unknown;unknown;0;0;;"); +Constant * +OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) { + StringRef UnknownLoc = ";unknown;unknown;0;0;;"; + return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize); } -Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, Function *F) { +Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, + uint32_t &SrcLocStrSize, + Function *F) { DILocation *DIL = DL.get(); if (!DIL) - return getOrCreateDefaultSrcLocStr(); + return getOrCreateDefaultSrcLocStr(SrcLocStrSize); StringRef FileName = M.getName(); if (DIFile *DIF = DIL->getFile()) if (Optional<StringRef> Source = DIF->getSource()) @@ -346,12 +376,13 @@ Constant 
*OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, Function *F) { if (Function.empty() && F) Function = F->getName(); return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(), - DIL->getColumn()); + DIL->getColumn(), SrcLocStrSize); } -Constant * -OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) { - return getOrCreateSrcLocStr(Loc.DL, Loc.IP.getBlock()->getParent()); +Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc, + uint32_t &SrcLocStrSize) { + return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize, + Loc.IP.getBlock()->getParent()); } Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) { @@ -393,9 +424,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind, break; } - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Args[] = {getOrCreateIdent(SrcLocStr, BarrierLocFlags), - getOrCreateThreadID(getOrCreateIdent(SrcLocStr))}; + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Args[] = { + getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags), + getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))}; // If we are in a cancellable parallel region, barriers are cancellation // points. @@ -441,8 +474,9 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, llvm_unreachable("Unknown cancel kind!"); } - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind}; Value *Result = Builder.CreateCall( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args); @@ -510,11 +544,14 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) { + assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + if (!updateToLocation(Loc)) return Loc.IP; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadID = getOrCreateThreadID(Ident); if (NumThreads) { @@ -777,8 +814,10 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num); auto PrivHelper = [&](Value &V) { - if (&V == TIDAddr || &V == ZeroAddr) + if (&V == TIDAddr || &V == ZeroAddr) { + OI.ExcludeArgsFromAggregate.push_back(&V); return; + } SetVector<Use *> Uses; for (Use &U : V.uses()) @@ -871,8 +910,9 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) { // Build call void __kmpc_flush(ident_t *loc) - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Args[] = {getOrCreateIdent(SrcLocStr)}; + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)}; Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args); } @@ -886,8 +926,9 @@ void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) { void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) { // Build call 
kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 // global_tid); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *Args[] = {Ident, getOrCreateThreadID(Ident)}; // Ignore return result until untied tasks are supported. @@ -903,8 +944,9 @@ void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) { void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) { // Build call __kmpc_omp_taskyield(loc, thread_id, 0); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Constant *I32Null = ConstantInt::getNullValue(Int32); Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null}; @@ -1114,14 +1156,16 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( Module *Module = Func->getParent(); Value *RedArrayPtr = Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr"); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); bool CanGenerateAtomic = llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) { return RI.AtomicReductionGen; }); - Value *Ident = getOrCreateIdent( - SrcLocStr, CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE - : IdentFlag(0)); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, + CanGenerateAtomic + ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE + : IdentFlag(0)); Value *ThreadId = getOrCreateThreadID(Ident); Constant *NumVariables = Builder.getInt32(NumReductions); const DataLayout &DL = Module->getDataLayout(); @@ -1235,8 +1279,9 @@ OpenMPIRBuilder::createMaster(const LocationDescription &Loc, return Loc.IP; Directive OMPD = Directive::OMPD_master; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId}; @@ -1258,8 +1303,9 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc, return Loc.IP; Directive OMPD = Directive::OMPD_masked; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId, Filter}; Value *ArgsEnd[] = {Ident, ThreadId}; @@ -1475,13 +1521,16 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) { assert(CLI->isValid() && "Requires a valid canonical loop"); + assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && + "Require dedicated allocate IP"); // Set up the source location value for OpenMP runtime. 
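All of the OpenMPIRBuilder call sites in these hunks repeat one new two-step pattern: the source-location string is created together with its size, and the size is threaded into the ident_t. A condensed sketch, with Loc assumed to be a valid LocationDescription:

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

  using namespace llvm;

  // Sketch: the ident_t now records the location string's length, so the
  // string and its size are produced by a single call and passed on
  // together.
  static Constant *makeIdent(OpenMPIRBuilder &OMPBuilder,
                             const OpenMPIRBuilder::LocationDescription &Loc) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    return OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }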
Builder.restoreIP(CLI->getPreheaderIP()); Builder.SetCurrentDebugLocation(DL); - Constant *SrcLocStr = getOrCreateSrcLocStr(DL); - Value *SrcLoc = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); + Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); // Declare useful OpenMP runtime functions. Value *IV = CLI->getIndVar(); @@ -1604,12 +1653,15 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop( DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) { assert(CLI->isValid() && "Requires a valid canonical loop"); + assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && + "Require dedicated allocate IP"); // Set up the source location value for OpenMP runtime. Builder.SetCurrentDebugLocation(DL); - Constant *SrcLocStr = getOrCreateSrcLocStr(DL); - Value *SrcLoc = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); + Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); // Declare useful OpenMP runtime functions. Value *IV = CLI->getIndVar(); @@ -2119,6 +2171,19 @@ static void addLoopMetadata(CanonicalLoopInfo *Loop, Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID); } +/// Attach llvm.access.group metadata to the memref instructions of \p Block +static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, + LoopInfo &LI) { + for (Instruction &I : *Block) { + if (I.mayReadOrWriteMemory()) { + // TODO: This instruction may already have access group from + // other pragmas e.g. #pragma clang loop vectorize. Append + // so that the existing metadata is not overwritten. + I.setMetadata(LLVMContext::MD_access_group, AccessGroup); + } + } +} + void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) { LLVMContext &Ctx = Builder.getContext(); addLoopMetadata( @@ -2134,6 +2199,53 @@ void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) { }); } +void OpenMPIRBuilder::applySimd(DebugLoc, CanonicalLoopInfo *CanonicalLoop) { + LLVMContext &Ctx = Builder.getContext(); + + Function *F = CanonicalLoop->getFunction(); + + FunctionAnalysisManager FAM; + FAM.registerPass([]() { return DominatorTreeAnalysis(); }); + FAM.registerPass([]() { return LoopAnalysis(); }); + FAM.registerPass([]() { return PassInstrumentationAnalysis(); }); + + LoopAnalysis LIA; + LoopInfo &&LI = LIA.run(*F, FAM); + + Loop *L = LI.getLoopFor(CanonicalLoop->getHeader()); + + SmallSet<BasicBlock *, 8> Reachable; + + // Get the basic blocks from the loop in which memref instructions + // can be found. + // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo, + // preferably without running any passes. + for (BasicBlock *Block : L->getBlocks()) { + if (Block == CanonicalLoop->getCond() || + Block == CanonicalLoop->getHeader()) + continue; + Reachable.insert(Block); + } + + // Add access group metadata to memory-access instructions. + MDNode *AccessGroup = MDNode::getDistinct(Ctx, {}); + for (BasicBlock *BB : Reachable) + addSimdMetadata(BB, AccessGroup, LI); + + // Use the above access group metadata to create loop level + // metadata, which should be distinct for each loop. + ConstantAsMetadata *BoolConst = + ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx))); + // TODO: If the loop has existing parallel access metadata, have + // to combine two lists. 
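The addLoopMetadata call below then attaches the loop-level metadata. The per-block half of applySimd boils down to tagging every memory access with one distinct access group, roughly as in this sketch:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"

  using namespace llvm;

  // Sketch of the per-block step: each memory access joins one distinct
  // access group, which llvm.loop.parallel_accesses then references.
  static MDNode *tagBlockForSimd(BasicBlock &BB) {
    LLVMContext &Ctx = BB.getContext();
    MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
    for (Instruction &I : BB)
      if (I.mayReadOrWriteMemory())
        I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
    return AccessGroup;
  }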
+ addLoopMetadata( + CanonicalLoop, + {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), + AccessGroup}), + MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), + BoolConst})}); +} + /// Create the TargetMachine object to query the backend for optimization /// preferences. /// @@ -2243,7 +2355,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { gatherPeelingPreferences(L, SE, TTI, /*UserAllowPeeling=*/false, /*UserAllowProfileBasedPeeling=*/false, - /*UserUnrollingSpecficValues=*/false); + /*UnrollingSpecficValues=*/false); SmallPtrSet<const Value *, 32> EphValues; CodeMetrics::collectEphemeralValues(L, &AC, EphValues); @@ -2379,8 +2491,9 @@ OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc, if (!updateToLocation(Loc)) return Loc.IP; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt); @@ -2407,8 +2520,9 @@ OpenMPIRBuilder::createSingle(const LocationDescription &Loc, } Directive OMPD = Directive::OMPD_single; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId}; @@ -2436,8 +2550,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical( return Loc.IP; Directive OMPD = Directive::OMPD_critical; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *LockVar = getOMPCriticalRegionLock(CriticalName); Value *Args[] = {Ident, ThreadId, LockVar}; @@ -2466,6 +2581,10 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues, const Twine &Name, bool IsDependSource) { + for (size_t I = 0; I < StoreValues.size(); I++) + assert(StoreValues[I]->getType()->isIntegerTy(64) && + "OpenMP runtime requires depend vec with i64 type"); + if (!updateToLocation(Loc)) return Loc.IP; @@ -2480,14 +2599,16 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, for (unsigned I = 0; I < NumLoops; ++I) { Value *DependAddrGEPIter = Builder.CreateInBoundsGEP( ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)}); - Builder.CreateStore(StoreValues[I], DependAddrGEPIter); + StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter); + STInst->setAlignment(Align(8)); } Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP( ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)}); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP}; @@ -2512,8 +2633,9 @@ OpenMPIRBuilder::InsertPointTy 
OpenMPIRBuilder::createOrderedThreadsSimd( Instruction *ExitCall = nullptr; if (IsThreads) { - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {Ident, ThreadId}; @@ -2718,8 +2840,9 @@ CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc, IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(Loc.IP); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {ThreadId, Size, Allocator}; @@ -2734,8 +2857,9 @@ CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc, IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(Loc.IP); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Value *Args[] = {ThreadId, Addr, Allocator}; Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free); @@ -2748,8 +2872,9 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate( IRBuilder<>::InsertPointGuard IPG(Builder); Builder.restoreIP(Loc.IP); - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadId = getOrCreateThreadID(Ident); Constant *ThreadPrivateCache = getOrCreateOMPInternalVariable(Int8PtrPtr, Name); @@ -2767,8 +2892,9 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, if (!updateToLocation(Loc)) return Loc.IP; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); ConstantInt *IsSPMDVal = ConstantInt::getSigned( IntegerType::getInt8Ty(Int8->getContext()), IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); @@ -2820,8 +2946,9 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, if (!updateToLocation(Loc)) return; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); - Value *Ident = getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); ConstantInt *IsSPMDVal = ConstantInt::getSigned( IntegerType::getInt8Ty(Int8->getContext()), IsSPMD ? 
OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); @@ -2860,7 +2987,8 @@ Constant *OpenMPIRBuilder::getOrCreateOMPInternalVariable( StringRef RuntimeName = Out.str(); auto &Elem = *InternalVars.try_emplace(RuntimeName, nullptr).first; if (Elem.second) { - assert(Elem.second->getType()->getPointerElementType() == Ty && + assert(cast<PointerType>(Elem.second->getType()) + ->isOpaqueOrPointeeTypeMatches(Ty) && "OMP internal variable has different type than requested"); } else { // TODO: investigate the appropriate linkage type used for the global diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp index a37fd5454dd4..221a3a84b49b 100644 --- a/llvm/lib/FuzzMutate/Operations.cpp +++ b/llvm/lib/FuzzMutate/Operations.cpp @@ -169,7 +169,7 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { OpDescriptor llvm::fuzzerop::gepDescriptor(unsigned Weight) { auto buildGEP = [](ArrayRef<Value *> Srcs, Instruction *Inst) { - Type *Ty = cast<PointerType>(Srcs[0]->getType())->getElementType(); + Type *Ty = Srcs[0]->getType()->getPointerElementType(); auto Indices = makeArrayRef(Srcs).drop_front(1); return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", Inst); }; diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp index 1295714839e8..27c3bdfb22a8 100644 --- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp +++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp @@ -53,8 +53,8 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef<Instruction *> Insts, IP = ++I->getIterator(); assert(IP != BB.end() && "guaranteed by the findPointer"); } - auto *NewLoad = new LoadInst( - cast<PointerType>(Ptr->getType())->getElementType(), Ptr, "L", &*IP); + auto *NewLoad = + new LoadInst(Ptr->getType()->getPointerElementType(), Ptr, "L", &*IP); // Only sample this load if it really matches the descriptor if (Pred.matches(Srcs, NewLoad)) @@ -141,12 +141,12 @@ Value *RandomIRBuilder::findPointer(BasicBlock &BB, if (auto PtrTy = dyn_cast<PointerType>(Inst->getType())) { // We can never generate loads from non first class or non sized types - if (!PtrTy->getElementType()->isSized() || - !PtrTy->getElementType()->isFirstClassType()) + Type *ElemTy = PtrTy->getPointerElementType(); + if (!ElemTy->isSized() || !ElemTy->isFirstClassType()) return false; // TODO: Check if this is horribly expensive. 
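The pointer-type hunks here and in the surrounding files are opaque-pointer preparation: callers stop casting to PointerType to reach the pointee and instead use the transitional accessors. The two idioms in miniature:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;

  static Type *pointeeOf(Value *Ptr) {
    // Was: cast<PointerType>(Ptr->getType())->getElementType()
    return Ptr->getType()->getPointerElementType();
  }

  static bool pointeeMatches(PointerType *PTy, Type *Ty) {
    // Accepts typed pointers whose pointee matches, and opaque pointers.
    return PTy->isOpaqueOrPointeeTypeMatches(Ty);
  }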
- return Pred.matches(Srcs, UndefValue::get(PtrTy->getElementType())); + return Pred.matches(Srcs, UndefValue::get(ElemTy)); } return false; }; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index bbe0c97e60a2..179754e275b0 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -587,7 +587,7 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { OS << " addrspace(" << AddressSpace << ')'; return; } - print(PTy->getElementType(), OS); + print(PTy->getNonOpaquePointerElementType(), OS); if (unsigned AddressSpace = PTy->getAddressSpace()) OS << " addrspace(" << AddressSpace << ')'; OS << '*'; @@ -1986,6 +1986,8 @@ static void writeDIStringType(raw_ostream &Out, const DIStringType *N, Printer.printString("name", N->getName()); Printer.printMetadata("stringLength", N->getRawStringLength()); Printer.printMetadata("stringLengthExpression", N->getRawStringLengthExp()); + Printer.printMetadata("stringLocationExpression", + N->getRawStringLocationExp()); Printer.printInt("size", N->getSizeInBits()); Printer.printInt("align", N->getAlignInBits()); Printer.printDwarfEnum("encoding", N->getEncoding(), diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index c899afae6cce..c92bacaee36d 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -607,14 +607,14 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) { AttributeSet AttributeSet::addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const { if (hasAttribute(Kind)) return *this; - AttrBuilder B; + AttrBuilder B(C); B.addAttribute(Kind); return addAttributes(C, AttributeSet::get(C, B)); } AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind, StringRef Value) const { - AttrBuilder B; + AttrBuilder B(C); B.addAttribute(Kind, Value); return addAttributes(C, AttributeSet::get(C, B)); } @@ -627,17 +627,15 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, if (!AS.hasAttributes()) return *this; - AttrBuilder B(AS); - for (const auto &I : *this) - B.addAttribute(I); - - return get(C, B); + AttrBuilder B(C, *this); + B.merge(AttrBuilder(C, AS)); + return get(C, B); } AttributeSet AttributeSet::removeAttribute(LLVMContext &C, Attribute::AttrKind Kind) const { if (!hasAttribute(Kind)) return *this; - AttrBuilder B(*this); + AttrBuilder B(C, *this); B.removeAttribute(Kind); return get(C, B); } @@ -645,14 +643,14 @@ AttributeSet AttributeSet::removeAttribute(LLVMContext &C, AttributeSet AttributeSet::removeAttribute(LLVMContext &C, StringRef Kind) const { if (!hasAttribute(Kind)) return *this; - AttrBuilder B(*this); + AttrBuilder B(C, *this); B.removeAttribute(Kind); return get(C, B); } AttributeSet AttributeSet::removeAttributes(LLVMContext &C, - const AttrBuilder &Attrs) const { - AttrBuilder B(*this); + const AttributeMask &Attrs) const { + AttrBuilder B(C, *this); // If there is nothing to remove, directly return the original set. if (!B.overlaps(Attrs)) return *this; @@ -817,28 +815,7 @@ AttributeSetNode *AttributeSetNode::getSorted(LLVMContext &C, } AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) { - // Add target-independent attributes. 
- SmallVector<Attribute, 8> Attrs; - for (Attribute::AttrKind Kind = Attribute::None; - Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) { - if (!B.contains(Kind)) - continue; - - Attribute Attr; - if (Attribute::isTypeAttrKind(Kind)) - Attr = Attribute::get(C, Kind, B.getTypeAttr(Kind)); - else if (Attribute::isIntAttrKind(Kind)) - Attr = Attribute::get(C, Kind, B.getRawIntAttr(Kind)); - else - Attr = Attribute::get(C, Kind); - Attrs.push_back(Attr); - } - - // Add target-dependent (string) attributes. - for (const auto &TDA : B.td_attrs()) - Attrs.emplace_back(Attribute::get(C, TDA.first, TDA.second)); - - return getSorted(C, Attrs); + return getSorted(C, B.attrs()); } bool AttributeSetNode::hasAttribute(StringRef Kind) const { @@ -1194,9 +1171,9 @@ AttributeList AttributeList::get(LLVMContext &C, SmallVector<AttributeSet, 8> NewAttrSets(MaxSize); for (unsigned I = 0; I < MaxSize; ++I) { - AttrBuilder CurBuilder; + AttrBuilder CurBuilder(C); for (const auto &List : Attrs) - CurBuilder.merge(List.getAttributes(I - 1)); + CurBuilder.merge(AttrBuilder(C, List.getAttributes(I - 1))); NewAttrSets[I] = AttributeSet::get(C, CurBuilder); } @@ -1218,14 +1195,14 @@ AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index, AttributeList AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index, StringRef Kind, StringRef Value) const { - AttrBuilder B; + AttrBuilder B(C); B.addAttribute(Kind, Value); return addAttributesAtIndex(C, Index, B); } AttributeList AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index, Attribute A) const { - AttrBuilder B; + AttrBuilder B(C); B.addAttribute(A); return addAttributesAtIndex(C, Index, B); } @@ -1250,16 +1227,7 @@ AttributeList AttributeList::addAttributesAtIndex(LLVMContext &C, if (!pImpl) return AttributeList::get(C, {{Index, AttributeSet::get(C, B)}}); -#ifndef NDEBUG - // FIXME it is not obvious how this should work for alignment. For now, say - // we can't change a known alignment. - const MaybeAlign OldAlign = getAttributes(Index).getAlignment(); - const MaybeAlign NewAlign = B.getAlignment(); - assert((!OldAlign || !NewAlign || OldAlign == NewAlign) && - "Attempt to change alignment!"); -#endif - - AttrBuilder Merged(getAttributes(Index)); + AttrBuilder Merged(C, getAttributes(Index)); Merged.merge(B); return setAttributesAtIndex(C, Index, AttributeSet::get(C, Merged)); } @@ -1276,7 +1244,7 @@ AttributeList AttributeList::addParamAttribute(LLVMContext &C, for (unsigned ArgNo : ArgNos) { unsigned Index = attrIdxToArrayIdx(ArgNo + FirstArgIndex); - AttrBuilder B(AttrSets[Index]); + AttrBuilder B(C, AttrSets[Index]); B.addAttribute(A); AttrSets[Index] = AttributeSet::get(C, B); } @@ -1314,9 +1282,8 @@ AttributeList AttributeList::removeAttributeAtIndex(LLVMContext &C, return getImpl(C, AttrSets); } -AttributeList -AttributeList::removeAttributesAtIndex(LLVMContext &C, unsigned Index, - const AttrBuilder &AttrsToRemove) const { +AttributeList AttributeList::removeAttributesAtIndex( + LLVMContext &C, unsigned Index, const AttributeMask &AttrsToRemove) const { AttributeSet Attrs = getAttributes(Index); AttributeSet NewAttrs = Attrs.removeAttributes(C, AttrsToRemove); // If nothing was removed, return the original list. 
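The Attributes.cpp changes amount to two API shifts: AttrBuilder is now bound to an LLVMContext (it stores real Attribute objects rather than bitfields), and removal predicates move to the new AttributeMask type. A small usage sketch of the post-change API, assuming the function-attribute convenience accessors:

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/LLVMContext.h"

  using namespace llvm;

  // Sketch: context-bound builder for additions, AttributeMask for removals.
  static AttributeList tweakFnAttrs(LLVMContext &Ctx, AttributeList AL) {
    AttrBuilder B(Ctx); // the context argument is now required
    B.addAttribute(Attribute::NoUnwind);
    AL = AL.addFnAttributes(Ctx, B);

    AttributeMask ToRemove;
    ToRemove.addAttribute(Attribute::ReadOnly);
    return AL.removeFnAttributes(Ctx, ToRemove);
  }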
@@ -1340,7 +1307,7 @@ AttributeList::removeAttributesAtIndex(LLVMContext &C, AttributeList AttributeList::addDereferenceableRetAttr(LLVMContext &C, uint64_t Bytes) const { - AttrBuilder B; + AttrBuilder B(C); B.addDereferenceableAttr(Bytes); return addRetAttributes(C, B); } @@ -1348,7 +1315,7 @@ AttributeList AttributeList::addDereferenceableRetAttr(LLVMContext &C, AttributeList AttributeList::addDereferenceableParamAttr(LLVMContext &C, unsigned Index, uint64_t Bytes) const { - AttrBuilder B; + AttrBuilder B(C); B.addDereferenceableAttr(Bytes); return addParamAttributes(C, Index, B); } @@ -1356,7 +1323,7 @@ AttributeList AttributeList::addDereferenceableParamAttr(LLVMContext &C, AttributeList AttributeList::addDereferenceableOrNullParamAttr(LLVMContext &C, unsigned Index, uint64_t Bytes) const { - AttrBuilder B; + AttrBuilder B(C); B.addDereferenceableOrNullAttr(Bytes); return addParamAttributes(C, Index, B); } @@ -1365,7 +1332,7 @@ AttributeList AttributeList::addAllocSizeParamAttr(LLVMContext &C, unsigned Index, unsigned ElemSizeArg, const Optional<unsigned> &NumElemsArg) { - AttrBuilder B; + AttrBuilder B(C); B.addAllocSizeAttr(ElemSizeArg, NumElemsArg); return addParamAttributes(C, Index, B); } @@ -1549,97 +1516,93 @@ LLVM_DUMP_METHOD void AttributeList::dump() const { print(dbgs()); } // AttrBuilder Method Implementations //===----------------------------------------------------------------------===// -// FIXME: Remove this ctor, use AttributeSet. -AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) { - AttributeSet AS = AL.getAttributes(Index); - for (const auto &A : AS) - addAttribute(A); -} - -AttrBuilder::AttrBuilder(AttributeSet AS) { - for (const auto &A : AS) - addAttribute(A); +AttrBuilder::AttrBuilder(LLVMContext &Ctx, AttributeSet AS) : Ctx(Ctx) { + append_range(Attrs, AS); + assert(is_sorted(Attrs) && "AttributeSet should be sorted"); } -void AttrBuilder::clear() { - Attrs.reset(); - TargetDepAttrs.clear(); - IntAttrs = {}; - TypeAttrs = {}; -} +void AttrBuilder::clear() { Attrs.clear(); } -Optional<unsigned> -AttrBuilder::kindToIntIndex(Attribute::AttrKind Kind) const { - if (Attribute::isIntAttrKind(Kind)) - return Kind - Attribute::FirstIntAttr; - return None; -} +/// Attribute comparator that only compares attribute keys. Enum attributes are +/// sorted before string attributes. 
+struct AttributeComparator { + bool operator()(Attribute A0, Attribute A1) const { + bool A0IsString = A0.isStringAttribute(); + bool A1IsString = A1.isStringAttribute(); + if (A0IsString) { + if (A1IsString) + return A0.getKindAsString() < A1.getKindAsString(); + else + return false; + } + if (A1IsString) + return true; + return A0.getKindAsEnum() < A1.getKindAsEnum(); + } + bool operator()(Attribute A0, Attribute::AttrKind Kind) const { + if (A0.isStringAttribute()) + return false; + return A0.getKindAsEnum() < Kind; + } + bool operator()(Attribute A0, StringRef Kind) const { + if (A0.isStringAttribute()) + return A0.getKindAsString() < Kind; + return true; + } +}; -Optional<unsigned> -AttrBuilder::kindToTypeIndex(Attribute::AttrKind Kind) const { - if (Attribute::isTypeAttrKind(Kind)) - return Kind - Attribute::FirstTypeAttr; - return None; +template <typename K> +static void addAttributeImpl(SmallVectorImpl<Attribute> &Attrs, K Kind, + Attribute Attr) { + auto It = lower_bound(Attrs, Kind, AttributeComparator()); + if (It != Attrs.end() && It->hasAttribute(Kind)) + std::swap(*It, Attr); + else + Attrs.insert(It, Attr); } AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) { - if (Attr.isStringAttribute()) { - addAttribute(Attr.getKindAsString(), Attr.getValueAsString()); - return *this; - } - - Attribute::AttrKind Kind = Attr.getKindAsEnum(); - Attrs[Kind] = true; - - if (Optional<unsigned> TypeIndex = kindToTypeIndex(Kind)) - TypeAttrs[*TypeIndex] = Attr.getValueAsType(); - else if (Optional<unsigned> IntIndex = kindToIntIndex(Kind)) - IntAttrs[*IntIndex] = Attr.getValueAsInt(); + if (Attr.isStringAttribute()) + addAttributeImpl(Attrs, Attr.getKindAsString(), Attr); + else + addAttributeImpl(Attrs, Attr.getKindAsEnum(), Attr); + return *this; +} +AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Kind) { + addAttributeImpl(Attrs, Kind, Attribute::get(Ctx, Kind)); return *this; } AttrBuilder &AttrBuilder::addAttribute(StringRef A, StringRef V) { - TargetDepAttrs[A] = V; + addAttributeImpl(Attrs, A, Attribute::get(Ctx, A, V)); return *this; } AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) { assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!"); - Attrs[Val] = false; - - if (Optional<unsigned> TypeIndex = kindToTypeIndex(Val)) - TypeAttrs[*TypeIndex] = nullptr; - else if (Optional<unsigned> IntIndex = kindToIntIndex(Val)) - IntAttrs[*IntIndex] = 0; - - return *this; -} - -AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) { - remove(A.getAttributes(Index)); + auto It = lower_bound(Attrs, Val, AttributeComparator()); + if (It != Attrs.end() && It->hasAttribute(Val)) + Attrs.erase(It); return *this; } AttrBuilder &AttrBuilder::removeAttribute(StringRef A) { - TargetDepAttrs.erase(A); + auto It = lower_bound(Attrs, A, AttributeComparator()); + if (It != Attrs.end() && It->hasAttribute(A)) + Attrs.erase(It); return *this; } uint64_t AttrBuilder::getRawIntAttr(Attribute::AttrKind Kind) const { - Optional<unsigned> IntIndex = kindToIntIndex(Kind); - assert(IntIndex && "Not an int attribute"); - return IntAttrs[*IntIndex]; + assert(Attribute::isIntAttrKind(Kind) && "Not an int attribute"); + Attribute A = getAttribute(Kind); + return A.isValid() ? 
A.getValueAsInt() : 0; } AttrBuilder &AttrBuilder::addRawIntAttr(Attribute::AttrKind Kind, uint64_t Value) { - Optional<unsigned> IntIndex = kindToIntIndex(Kind); - assert(IntIndex && "Not an int attribute"); - assert(Value && "Value cannot be zero"); - Attrs[Kind] = true; - IntAttrs[*IntIndex] = Value; - return *this; + return addAttribute(Attribute::get(Ctx, Kind, Value)); } std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const { @@ -1709,17 +1672,13 @@ AttrBuilder &AttrBuilder::addVScaleRangeAttrFromRawRepr(uint64_t RawArgs) { } Type *AttrBuilder::getTypeAttr(Attribute::AttrKind Kind) const { - Optional<unsigned> TypeIndex = kindToTypeIndex(Kind); - assert(TypeIndex && "Not a type attribute"); - return TypeAttrs[*TypeIndex]; + assert(Attribute::isTypeAttrKind(Kind) && "Not a type attribute"); + Attribute A = getAttribute(Kind); + return A.isValid() ? A.getValueAsType() : nullptr; } AttrBuilder &AttrBuilder::addTypeAttr(Attribute::AttrKind Kind, Type *Ty) { - Optional<unsigned> TypeIndex = kindToTypeIndex(Kind); - assert(TypeIndex && "Not a type attribute"); - Attrs[Kind] = true; - TypeAttrs[*TypeIndex] = Ty; - return *this; + return addAttribute(Attribute::get(Ctx, Kind, Ty)); } AttrBuilder &AttrBuilder::addByValAttr(Type *Ty) { @@ -1743,76 +1702,43 @@ AttrBuilder &AttrBuilder::addInAllocaAttr(Type *Ty) { } AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) { - // FIXME: What if both have an int/type attribute, but they don't match?! - for (unsigned Index = 0; Index < Attribute::NumIntAttrKinds; ++Index) - if (!IntAttrs[Index]) - IntAttrs[Index] = B.IntAttrs[Index]; - - for (unsigned Index = 0; Index < Attribute::NumTypeAttrKinds; ++Index) - if (!TypeAttrs[Index]) - TypeAttrs[Index] = B.TypeAttrs[Index]; - - Attrs |= B.Attrs; - - for (const auto &I : B.td_attrs()) - TargetDepAttrs[I.first] = I.second; + // TODO: Could make this O(n) as we're merging two sorted lists. + for (const auto &I : B.attrs()) + addAttribute(I); return *this; } -AttrBuilder &AttrBuilder::remove(const AttrBuilder &B) { - // FIXME: What if both have an int/type attribute, but they don't match?! - for (unsigned Index = 0; Index < Attribute::NumIntAttrKinds; ++Index) - if (B.IntAttrs[Index]) - IntAttrs[Index] = 0; - - for (unsigned Index = 0; Index < Attribute::NumTypeAttrKinds; ++Index) - if (B.TypeAttrs[Index]) - TypeAttrs[Index] = nullptr; - - Attrs &= ~B.Attrs; - - for (const auto &I : B.td_attrs()) - TargetDepAttrs.erase(I.first); - +AttrBuilder &AttrBuilder::remove(const AttributeMask &AM) { + erase_if(Attrs, [&](Attribute A) { return AM.contains(A); }); return *this; } -bool AttrBuilder::overlaps(const AttrBuilder &B) const { - // First check if any of the target independent attributes overlap. - if ((Attrs & B.Attrs).any()) - return true; - - // Then check if any target dependent ones do. 
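Where the removed bitset-and-map bookkeeping above kept enum, int, type, and string attributes in separate storage, the new representation keeps one sorted vector and answers every query with a single binary search. A standalone sketch of the user-visible behavior, with an invented string attribute name:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Enum and string attributes now share one sorted container; both kinds
// of key go through lower_bound with the comparator defined above.
static void queryExample(LLVMContext &Ctx) {
  AttrBuilder B(Ctx);
  B.addAttribute(Attribute::NoUndef);
  B.addAttribute("my-string-attr", "on");         // sorts after enum kinds
  bool HasEnum = B.contains(Attribute::NoUndef);  // binary search, enum key
  Attribute A = B.getAttribute("my-string-attr"); // binary search, string key
  (void)HasEnum;
  (void)A;
}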
- for (const auto &I : td_attrs()) - if (B.contains(I.first)) - return true; - - return false; +bool AttrBuilder::overlaps(const AttributeMask &AM) const { + return any_of(Attrs, [&](Attribute A) { return AM.contains(A); }); } -bool AttrBuilder::contains(StringRef A) const { - return TargetDepAttrs.find(A) != TargetDepAttrs.end(); +Attribute AttrBuilder::getAttribute(Attribute::AttrKind A) const { + assert((unsigned)A < Attribute::EndAttrKinds && "Attribute out of range!"); + auto It = lower_bound(Attrs, A, AttributeComparator()); + if (It != Attrs.end() && It->hasAttribute(A)) + return *It; + return {}; } -bool AttrBuilder::hasAttributes() const { - return !Attrs.none() || !TargetDepAttrs.empty(); +Attribute AttrBuilder::getAttribute(StringRef A) const { + auto It = lower_bound(Attrs, A, AttributeComparator()); + if (It != Attrs.end() && It->hasAttribute(A)) + return *It; + return {}; } -bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const { - AttributeSet AS = AL.getAttributes(Index); - - for (const auto &Attr : AS) { - if (Attr.isEnumAttribute() || Attr.isIntAttribute()) { - if (contains(Attr.getKindAsEnum())) - return true; - } else { - assert(Attr.isStringAttribute() && "Invalid attribute kind!"); - return contains(Attr.getKindAsString()); - } - } +bool AttrBuilder::contains(Attribute::AttrKind A) const { + return getAttribute(A).isValid(); +} - return false; +bool AttrBuilder::contains(StringRef A) const { + return getAttribute(A).isValid(); } bool AttrBuilder::hasAlignmentAttr() const { @@ -1820,14 +1746,7 @@ bool AttrBuilder::hasAlignmentAttr() const { } bool AttrBuilder::operator==(const AttrBuilder &B) const { - if (Attrs != B.Attrs) - return false; - - for (const auto &TDA : TargetDepAttrs) - if (B.TargetDepAttrs.find(TDA.first) == B.TargetDepAttrs.end()) - return false; - - return IntAttrs == B.IntAttrs && TypeAttrs == B.TypeAttrs; + return Attrs == B.Attrs; } //===----------------------------------------------------------------------===// @@ -1835,16 +1754,16 @@ bool AttrBuilder::operator==(const AttrBuilder &B) const { //===----------------------------------------------------------------------===// /// Which attributes cannot be applied to a type. -AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) { - AttrBuilder Incompatible; +AttributeMask AttributeFuncs::typeIncompatible(Type *Ty) { + AttributeMask Incompatible; if (!Ty->isIntegerTy()) - // Attribute that only apply to integers. + // Attributes that only apply to integers. Incompatible.addAttribute(Attribute::SExt) .addAttribute(Attribute::ZExt); if (!Ty->isPointerTy()) - // Attribute that only apply to pointers. + // Attributes that only apply to pointers. 
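Removal sets are now expressed as AttributeMask, which records only attribute kinds and so sidesteps the old "what if the int/type payloads don't match" FIXMEs that merge() and remove() carried. A sketch of building such a mask, mirroring the stack-protector mask used later in this diff:

#include "llvm/IR/Attributes.h"
using namespace llvm;

// Kind-only removal set; attribute values are irrelevant when erasing.
static AttributeMask makeSSPMask() {
  AttributeMask M;
  M.addAttribute(Attribute::StackProtect);
  M.addAttribute(Attribute::StackProtectStrong);
  M.addAttribute(Attribute::StackProtectReq);
  return M;
}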
Incompatible.addAttribute(Attribute::Nest) .addAttribute(Attribute::NoAlias) .addAttribute(Attribute::NoCapture) @@ -1852,15 +1771,18 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) { .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::SwiftError) - .addAlignmentAttr(1) // the int here is ignored - .addDereferenceableAttr(1) // the int here is ignored - .addDereferenceableOrNullAttr(1) // the int here is ignored - .addPreallocatedAttr(Ty) - .addInAllocaAttr(Ty) - .addByValAttr(Ty) - .addStructRetAttr(Ty) - .addByRefAttr(Ty) - .addTypeAttr(Attribute::ElementType, Ty); + .addAttribute(Attribute::Dereferenceable) + .addAttribute(Attribute::DereferenceableOrNull) + .addAttribute(Attribute::Preallocated) + .addAttribute(Attribute::InAlloca) + .addAttribute(Attribute::ByVal) + .addAttribute(Attribute::StructRet) + .addAttribute(Attribute::ByRef) + .addAttribute(Attribute::ElementType); + + if (!Ty->isPtrOrPtrVectorTy()) + // Attributes that only apply to pointers or vectors of pointers. + Incompatible.addAttribute(Attribute::Alignment); // Some attributes can apply to all "values" but there are no `void` values. if (Ty->isVoidTy()) @@ -1869,12 +1791,12 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) { return Incompatible; } -AttrBuilder AttributeFuncs::getUBImplyingAttributes() { - AttrBuilder B; - B.addAttribute(Attribute::NoUndef); - B.addDereferenceableAttr(1); - B.addDereferenceableOrNullAttr(1); - return B; +AttributeMask AttributeFuncs::getUBImplyingAttributes() { + AttributeMask AM; + AM.addAttribute(Attribute::NoUndef); + AM.addAttribute(Attribute::Dereferenceable); + AM.addAttribute(Attribute::DereferenceableOrNull); + return AM; } template<typename AttrClass> @@ -1910,10 +1832,16 @@ static void setOR(Function &Caller, const Function &Callee) { /// If the inlined function had a higher stack protection level than the /// calling function, then bump up the caller's stack protection level. static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) { + // If the calling function has *no* stack protection level (e.g. it was built + // with Clang's -fno-stack-protector or no_stack_protector attribute), don't + // change it as that could change the program's semantics. + if (!Caller.hasStackProtectorFnAttr()) + return; + // If upgrading the SSP attribute, clear out the old SSP Attributes first. // Having multiple SSP attributes doesn't actually hurt, but it adds useless // clutter to the IR. 
- AttrBuilder OldSSPAttr; + AttributeMask OldSSPAttr; OldSSPAttr.addAttribute(Attribute::StackProtect) .addAttribute(Attribute::StackProtectStrong) .addAttribute(Attribute::StackProtectReq); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b8ad2b294b87..45459e200b3d 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -727,6 +727,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1") return true; + if (Name == "amdgcn.alignbit") { + // Target specific intrinsic became redundant + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr, + {F->getReturnType()}); + return true; + } + break; } @@ -4488,7 +4495,7 @@ void llvm::UpgradeFunctionAttributes(Function &F) { if (F.getCallingConv() == CallingConv::X86_INTR && !F.arg_empty() && !F.hasParamAttribute(0, Attribute::ByVal)) { - Type *ByValTy = cast<PointerType>(F.getArg(0)->getType())->getElementType(); + Type *ByValTy = F.getArg(0)->getType()->getPointerElementType(); Attribute NewAttr = Attribute::getWithByValType(F.getContext(), ByValTy); F.addParamAttr(0, NewAttr); } @@ -4569,27 +4576,39 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { return DL.empty() ? std::string("G1") : (DL + "-G1").str(); } + std::string Res = DL.str(); + if (!T.isX86()) + return Res; + + // If the datalayout matches the expected format, add pointer size address + // spaces to the datalayout. std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64"; - // If X86, and the datalayout matches the expected format, add pointer size - // address spaces to the datalayout. - if (!T.isX86() || DL.contains(AddrSpaces)) - return std::string(DL); + if (!DL.contains(AddrSpaces)) { + SmallVector<StringRef, 4> Groups; + Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)"); + if (R.match(DL, &Groups)) + Res = (Groups[1] + AddrSpaces + Groups[3]).str(); + } - SmallVector<StringRef, 4> Groups; - Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)"); - if (!R.match(DL, &Groups)) - return std::string(DL); + // For 32-bit MSVC targets, raise the alignment of f80 values to 16 bytes. + // Raising the alignment is safe because Clang did not produce f80 values in + // the MSVC environment before this upgrade was added. + if (T.isWindowsMSVCEnvironment() && !T.isArch64Bit()) { + StringRef Ref = Res; + auto I = Ref.find("-f80:32-"); + if (I != StringRef::npos) + Res = (Ref.take_front(I) + "-f80:128-" + Ref.drop_front(I + 8)).str(); + } - return (Groups[1] + AddrSpaces + Groups[3]).str(); + return Res; } void llvm::UpgradeAttributes(AttrBuilder &B) { StringRef FramePointer; - if (B.contains("no-frame-pointer-elim")) { + Attribute A = B.getAttribute("no-frame-pointer-elim"); + if (A.isValid()) { // The value can be "true" or "false". - for (const auto &I : B.td_attrs()) - if (I.first == "no-frame-pointer-elim") - FramePointer = I.second == "true" ? "all" : "none"; + FramePointer = A.getValueAsString() == "true" ? "all" : "none"; B.removeAttribute("no-frame-pointer-elim"); } if (B.contains("no-frame-pointer-elim-non-leaf")) { @@ -4601,12 +4620,10 @@ void llvm::UpgradeAttributes(AttrBuilder &B) { if (!FramePointer.empty()) B.addAttribute("frame-pointer", FramePointer); - if (B.contains("null-pointer-is-valid")) { + A = B.getAttribute("null-pointer-is-valid"); + if (A.isValid()) { // The value can be "true" or "false". 
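The UpgradeDataLayoutString change above now also raises f80 alignment for 32-bit MSVC targets on top of the existing x86 address-space insertion. A small standalone check of that behavior; the input is a pre-upgrade i686 MSVC layout string, and the expected output follows from the regex and the -f80:32- rewrite shown above:

#include "llvm/IR/AutoUpgrade.h"
#include <iostream>
#include <string>

int main() {
  // Pre-upgrade 32-bit MSVC datalayout (no p270/p271/p272, f80 at 32 bits).
  std::string Old = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32";
  std::string New = llvm::UpgradeDataLayoutString(Old, "i686-pc-windows-msvc");
  // Expected per the hunk above:
  // e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-a:0:32-S32
  std::cout << New << '\n';
  return 0;
}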
- bool NullPointerIsValid = false; - for (const auto &I : B.td_attrs()) - if (I.first == "null-pointer-is-valid") - NullPointerIsValid = I.second == "true"; + bool NullPointerIsValid = A.getValueAsString() == "true"; B.removeAttribute("null-pointer-is-valid"); if (NullPointerIsValid) B.addAttribute(Attribute::NullPointerIsValid); diff --git a/llvm/lib/IR/Comdat.cpp b/llvm/lib/IR/Comdat.cpp index 1a5d38d17bc0..90d5c6e82e5c 100644 --- a/llvm/lib/IR/Comdat.cpp +++ b/llvm/lib/IR/Comdat.cpp @@ -25,6 +25,10 @@ Comdat::Comdat() = default; StringRef Comdat::getName() const { return Name->first(); } +void Comdat::addUser(GlobalObject *GO) { Users.insert(GO); } + +void Comdat::removeUser(GlobalObject *GO) { Users.erase(GO); } + LLVMComdatRef LLVMGetOrInsertComdat(LLVMModuleRef M, const char *Name) { return wrap(unwrap(M)->getOrInsertComdat(Name)); } diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 8668fe82601c..622a984be22c 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -119,21 +119,21 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { if (PointerType *DPTy = dyn_cast<PointerType>(DestTy)) if (PTy->getAddressSpace() == DPTy->getAddressSpace() && !PTy->isOpaque() && !DPTy->isOpaque() && - PTy->getElementType()->isSized()) { + PTy->getNonOpaquePointerElementType()->isSized()) { SmallVector<Value*, 8> IdxList; Value *Zero = Constant::getNullValue(Type::getInt32Ty(DPTy->getContext())); IdxList.push_back(Zero); - Type *ElTy = PTy->getElementType(); - while (ElTy && ElTy != DPTy->getElementType()) { + Type *ElTy = PTy->getNonOpaquePointerElementType(); + while (ElTy && ElTy != DPTy->getNonOpaquePointerElementType()) { ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, (uint64_t)0); IdxList.push_back(Zero); } - if (ElTy == DPTy->getElementType()) + if (ElTy == DPTy->getNonOpaquePointerElementType()) // This GEP is inbounds because all indices are zero. - return ConstantExpr::getInBoundsGetElementPtr(PTy->getElementType(), - V, IdxList); + return ConstantExpr::getInBoundsGetElementPtr( + PTy->getNonOpaquePointerElementType(), V, IdxList); } // Handle casts from one vector constant to another. We know that the src @@ -1299,63 +1299,6 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, return nullptr; } -/// This type is zero-sized if it's an array or structure of zero-sized types. -/// The only leaf zero-sized type is an empty structure. -static bool isMaybeZeroSizedType(Type *Ty) { - if (StructType *STy = dyn_cast<StructType>(Ty)) { - if (STy->isOpaque()) return true; // Can't say. - - // If all of elements have zero size, this does too. - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) - if (!isMaybeZeroSizedType(STy->getElementType(i))) return false; - return true; - - } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { - return isMaybeZeroSizedType(ATy->getElementType()); - } - return false; -} - -/// Compare the two constants as though they were getelementptr indices. -/// This allows coercion of the types to be the same thing. -/// -/// If the two constants are the "same" (after coercion), return 0. If the -/// first is less than the second, return -1, if the second is less than the -/// first, return 1. If the constants are not integral, return -2. -/// -static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) { - if (C1 == C2) return 0; - - // Ok, we found a different index. If they are not ConstantInt, we can't do - // anything with them. 
- if (!isa<ConstantInt>(C1) || !isa<ConstantInt>(C2)) - return -2; // don't know! - - // We cannot compare the indices if they don't fit in an int64_t. - if (cast<ConstantInt>(C1)->getValue().getActiveBits() > 64 || - cast<ConstantInt>(C2)->getValue().getActiveBits() > 64) - return -2; // don't know! - - // Ok, we have two differing integer indices. Sign extend them to be the same - // type. - int64_t C1Val = cast<ConstantInt>(C1)->getSExtValue(); - int64_t C2Val = cast<ConstantInt>(C2)->getSExtValue(); - - if (C1Val == C2Val) return 0; // They are equal - - // If the type being indexed over is really just a zero sized type, there is - // no pointer difference being made here. - if (isMaybeZeroSizedType(ElTy)) - return -2; // dunno. - - // If they are really different, now that they are the same type, then we - // found a difference! - if (C1Val < C2Val) - return -1; - else - return 1; -} - /// This function determines if there is anything we can decide about the two /// constants provided. This doesn't need to handle simple things like /// ConstantFP comparisons, but should instead handle ConstantExprs. @@ -1594,103 +1537,28 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) { // If its not weak linkage, the GVal must have a non-zero address // so the result is greater-than - if (!GV->hasExternalWeakLinkage()) + if (!GV->hasExternalWeakLinkage() && CE1GEP->isInBounds()) return ICmpInst::ICMP_UGT; - } else if (isa<ConstantPointerNull>(CE1Op0)) { - // If we are indexing from a null pointer, check to see if we have any - // non-zero indices. - for (unsigned i = 1, e = CE1->getNumOperands(); i != e; ++i) - if (!CE1->getOperand(i)->isNullValue()) - // Offsetting from null, must not be equal. - return ICmpInst::ICMP_UGT; - // Only zero indexes from null, must still be zero. - return ICmpInst::ICMP_EQ; } - // Otherwise, we can't really say if the first operand is null or not. } else if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) { - if (isa<ConstantPointerNull>(CE1Op0)) { - // If its not weak linkage, the GVal must have a non-zero address - // so the result is less-than - if (!GV2->hasExternalWeakLinkage()) - return ICmpInst::ICMP_ULT; - } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) { - if (GV == GV2) { - // If this is a getelementptr of the same global, then it must be - // different. Because the types must match, the getelementptr could - // only have at most one index, and because we fold getelementptr's - // with a single zero index, it must be nonzero. - assert(CE1->getNumOperands() == 2 && - !CE1->getOperand(1)->isNullValue() && - "Surprising getelementptr!"); - return ICmpInst::ICMP_UGT; - } else { + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) { + if (GV != GV2) { if (CE1GEP->hasAllZeroIndices()) return areGlobalsPotentiallyEqual(GV, GV2); return ICmpInst::BAD_ICMP_PREDICATE; } } - } else { - ConstantExpr *CE2 = cast<ConstantExpr>(V2); - Constant *CE2Op0 = CE2->getOperand(0); - - // There are MANY other foldings that we could perform here. They will - // probably be added on demand, as they seem needed. - switch (CE2->getOpcode()) { - default: break; - case Instruction::GetElementPtr: - // By far the most common case to handle is when the base pointers are - // obviously to the same global. - if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) { - // Don't know relative ordering, but check for inequality. 
- if (CE1Op0 != CE2Op0) { - GEPOperator *CE2GEP = cast<GEPOperator>(CE2); - if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices()) - return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0), - cast<GlobalValue>(CE2Op0)); - return ICmpInst::BAD_ICMP_PREDICATE; - } - // Ok, we know that both getelementptr instructions are based on the - // same global. From this, we can precisely determine the relative - // ordering of the resultant pointers. - unsigned i = 1; - - // The logic below assumes that the result of the comparison - // can be determined by finding the first index that differs. - // This doesn't work if there is over-indexing in any - // subsequent indices, so check for that case first. - if (!CE1->isGEPWithNoNotionalOverIndexing() || - !CE2->isGEPWithNoNotionalOverIndexing()) - return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal. - - // Compare all of the operands the GEP's have in common. - gep_type_iterator GTI = gep_type_begin(CE1); - for (;i != CE1->getNumOperands() && i != CE2->getNumOperands(); - ++i, ++GTI) - switch (IdxCompare(CE1->getOperand(i), - CE2->getOperand(i), GTI.getIndexedType())) { - case -1: return isSigned ? ICmpInst::ICMP_SLT:ICmpInst::ICMP_ULT; - case 1: return isSigned ? ICmpInst::ICMP_SGT:ICmpInst::ICMP_UGT; - case -2: return ICmpInst::BAD_ICMP_PREDICATE; - } - - // Ok, we ran out of things they have in common. If any leftovers - // are non-zero then we have a difference, otherwise we are equal. - for (; i < CE1->getNumOperands(); ++i) - if (!CE1->getOperand(i)->isNullValue()) { - if (isa<ConstantInt>(CE1->getOperand(i))) - return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; - else - return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal. - } - - for (; i < CE2->getNumOperands(); ++i) - if (!CE2->getOperand(i)->isNullValue()) { - if (isa<ConstantInt>(CE2->getOperand(i))) - return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; - else - return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal. - } - return ICmpInst::ICMP_EQ; + } else if (const auto *CE2GEP = dyn_cast<GEPOperator>(V2)) { + // By far the most common case to handle is when the base pointers are + // obviously to the same global. + const Constant *CE2Op0 = cast<Constant>(CE2GEP->getPointerOperand()); + if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) { + // Don't know relative ordering, but check for inequality. + if (CE1Op0 != CE2Op0) { + if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices()) + return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0), + cast<GlobalValue>(CE2Op0)); + return ICmpInst::BAD_ICMP_PREDICATE; } } } @@ -1704,7 +1572,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, return ICmpInst::BAD_ICMP_PREDICATE; } -Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, +Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, Constant *C1, Constant *C2) { Type *ResultTy; if (VectorType *VT = dyn_cast<VectorType>(C1->getType())) @@ -1714,10 +1582,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, ResultTy = Type::getInt1Ty(C1->getContext()); // Fold FCMP_FALSE/FCMP_TRUE unconditionally. 
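From here on the compare-folding entry point takes CmpInst::Predicate instead of a raw unsigned short, so the casts sprinkled through the old body disappear. A minimal caller sketch; the helper name is invented:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// getICmp forwards the typed predicate into ConstantFoldCompareInstruction;
// comparing two ConstantInts folds all the way down to an i1 constant.
static Constant *foldExample(LLVMContext &Ctx) {
  Constant *Two = ConstantInt::get(Type::getInt32Ty(Ctx), 2);
  Constant *Three = ConstantInt::get(Type::getInt32Ty(Ctx), 3);
  return ConstantExpr::getICmp(ICmpInst::ICMP_ULT, Two, Three); // i1 true
}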
- if (pred == FCmpInst::FCMP_FALSE) + if (Predicate == FCmpInst::FCMP_FALSE) return Constant::getNullValue(ResultTy); - if (pred == FCmpInst::FCMP_TRUE) + if (Predicate == FCmpInst::FCMP_TRUE) return Constant::getAllOnesValue(ResultTy); // Handle some degenerate cases first @@ -1725,7 +1593,6 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, return PoisonValue::get(ResultTy); if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) { - CmpInst::Predicate Predicate = CmpInst::Predicate(pred); bool isIntegerPredicate = ICmpInst::isIntPredicate(Predicate); // For EQ and NE, we can always pick a value for the undef to make the // predicate pass or fail, so we can return undef. @@ -1750,9 +1617,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() && !NullPointerIsDefined(nullptr /* F */, GV->getType()->getAddressSpace())) { - if (pred == ICmpInst::ICMP_EQ) + if (Predicate == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(C1->getContext()); - else if (pred == ICmpInst::ICMP_NE) + else if (Predicate == ICmpInst::ICMP_NE) return ConstantInt::getTrue(C1->getContext()); } // icmp eq/ne(GV,null) -> false/true @@ -1762,9 +1629,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() && !NullPointerIsDefined(nullptr /* F */, GV->getType()->getAddressSpace())) { - if (pred == ICmpInst::ICMP_EQ) + if (Predicate == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(C1->getContext()); - else if (pred == ICmpInst::ICMP_NE) + else if (Predicate == ICmpInst::ICMP_NE) return ConstantInt::getTrue(C1->getContext()); } } @@ -1772,16 +1639,16 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // The caller is expected to commute the operands if the constant expression // is C2. // C1 >= 0 --> true - if (pred == ICmpInst::ICMP_UGE) + if (Predicate == ICmpInst::ICMP_UGE) return Constant::getAllOnesValue(ResultTy); // C1 < 0 --> false - if (pred == ICmpInst::ICMP_ULT) + if (Predicate == ICmpInst::ICMP_ULT) return Constant::getNullValue(ResultTy); } // If the comparison is a comparison between two i1's, simplify it. if (C1->getType()->isIntegerTy(1)) { - switch(pred) { + switch (Predicate) { case ICmpInst::ICMP_EQ: if (isa<ConstantInt>(C2)) return ConstantExpr::getXor(C1, ConstantExpr::getNot(C2)); @@ -1796,12 +1663,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) { const APInt &V1 = cast<ConstantInt>(C1)->getValue(); const APInt &V2 = cast<ConstantInt>(C2)->getValue(); - return ConstantInt::get( - ResultTy, ICmpInst::compare(V1, V2, (ICmpInst::Predicate)pred)); + return ConstantInt::get(ResultTy, ICmpInst::compare(V1, V2, Predicate)); } else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) { const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF(); const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF(); - CmpInst::Predicate Predicate = CmpInst::Predicate(pred); return ConstantInt::get(ResultTy, FCmpInst::compare(C1V, C2V, Predicate)); } else if (auto *C1VTy = dyn_cast<VectorType>(C1->getType())) { @@ -1810,7 +1675,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, if (Constant *C2Splat = C2->getSplatValue()) return ConstantVector::getSplat( C1VTy->getElementCount(), - ConstantExpr::getCompare(pred, C1Splat, C2Splat)); + ConstantExpr::getCompare(Predicate, C1Splat, C2Splat)); // Do not iterate on scalable vector. 
The number of elements is unknown at // compile-time. @@ -1829,7 +1694,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, Constant *C2E = ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, I)); - ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E)); + ResElts.push_back(ConstantExpr::getCompare(Predicate, C1E, C2E)); } return ConstantVector::get(ResElts); @@ -1854,46 +1719,52 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, case FCmpInst::BAD_FCMP_PREDICATE: break; // Couldn't determine anything about these constants. case FCmpInst::FCMP_OEQ: // We know that C1 == C2 - Result = (pred == FCmpInst::FCMP_UEQ || pred == FCmpInst::FCMP_OEQ || - pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE || - pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE); + Result = + (Predicate == FCmpInst::FCMP_UEQ || Predicate == FCmpInst::FCMP_OEQ || + Predicate == FCmpInst::FCMP_ULE || Predicate == FCmpInst::FCMP_OLE || + Predicate == FCmpInst::FCMP_UGE || Predicate == FCmpInst::FCMP_OGE); break; case FCmpInst::FCMP_OLT: // We know that C1 < C2 - Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE || - pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT || - pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE); + Result = + (Predicate == FCmpInst::FCMP_UNE || Predicate == FCmpInst::FCMP_ONE || + Predicate == FCmpInst::FCMP_ULT || Predicate == FCmpInst::FCMP_OLT || + Predicate == FCmpInst::FCMP_ULE || Predicate == FCmpInst::FCMP_OLE); break; case FCmpInst::FCMP_OGT: // We know that C1 > C2 - Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE || - pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT || - pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE); + Result = + (Predicate == FCmpInst::FCMP_UNE || Predicate == FCmpInst::FCMP_ONE || + Predicate == FCmpInst::FCMP_UGT || Predicate == FCmpInst::FCMP_OGT || + Predicate == FCmpInst::FCMP_UGE || Predicate == FCmpInst::FCMP_OGE); break; case FCmpInst::FCMP_OLE: // We know that C1 <= C2 // We can only partially decide this relation. - if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT) + if (Predicate == FCmpInst::FCMP_UGT || Predicate == FCmpInst::FCMP_OGT) Result = 0; - else if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT) + else if (Predicate == FCmpInst::FCMP_ULT || + Predicate == FCmpInst::FCMP_OLT) Result = 1; break; case FCmpInst::FCMP_OGE: // We known that C1 >= C2 // We can only partially decide this relation. - if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT) + if (Predicate == FCmpInst::FCMP_ULT || Predicate == FCmpInst::FCMP_OLT) Result = 0; - else if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT) + else if (Predicate == FCmpInst::FCMP_UGT || + Predicate == FCmpInst::FCMP_OGT) Result = 1; break; case FCmpInst::FCMP_ONE: // We know that C1 != C2 // We can only partially decide this relation. - if (pred == FCmpInst::FCMP_OEQ || pred == FCmpInst::FCMP_UEQ) + if (Predicate == FCmpInst::FCMP_OEQ || Predicate == FCmpInst::FCMP_UEQ) Result = 0; - else if (pred == FCmpInst::FCMP_ONE || pred == FCmpInst::FCMP_UNE) + else if (Predicate == FCmpInst::FCMP_ONE || + Predicate == FCmpInst::FCMP_UNE) Result = 1; break; case FCmpInst::FCMP_UEQ: // We know that C1 == C2 || isUnordered(C1, C2). // We can only partially decide this relation. 
- if (pred == FCmpInst::FCMP_ONE) + if (Predicate == FCmpInst::FCMP_ONE) Result = 0; - else if (pred == FCmpInst::FCMP_UEQ) + else if (Predicate == FCmpInst::FCMP_UEQ) Result = 1; break; } @@ -1905,67 +1776,84 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, } else { // Evaluate the relation between the two constants, per the predicate. int Result = -1; // -1 = unknown, 0 = known false, 1 = known true. - switch (evaluateICmpRelation(C1, C2, - CmpInst::isSigned((CmpInst::Predicate)pred))) { + switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(Predicate))) { default: llvm_unreachable("Unknown relational!"); case ICmpInst::BAD_ICMP_PREDICATE: break; // Couldn't determine anything about these constants. case ICmpInst::ICMP_EQ: // We know the constants are equal! // If we know the constants are equal, we can decide the result of this // computation precisely. - Result = ICmpInst::isTrueWhenEqual((ICmpInst::Predicate)pred); + Result = ICmpInst::isTrueWhenEqual(Predicate); break; case ICmpInst::ICMP_ULT: - switch (pred) { + switch (Predicate) { case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULE: Result = 1; break; case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGE: Result = 0; break; + default: + break; } break; case ICmpInst::ICMP_SLT: - switch (pred) { + switch (Predicate) { case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SLE: Result = 1; break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SGE: Result = 0; break; + default: + break; } break; case ICmpInst::ICMP_UGT: - switch (pred) { + switch (Predicate) { case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGE: Result = 1; break; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: Result = 0; break; + default: + break; } break; case ICmpInst::ICMP_SGT: - switch (pred) { + switch (Predicate) { case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SGE: Result = 1; break; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SLE: Result = 0; break; + default: + break; } break; case ICmpInst::ICMP_ULE: - if (pred == ICmpInst::ICMP_UGT) Result = 0; - if (pred == ICmpInst::ICMP_ULT || pred == ICmpInst::ICMP_ULE) Result = 1; + if (Predicate == ICmpInst::ICMP_UGT) + Result = 0; + if (Predicate == ICmpInst::ICMP_ULT || Predicate == ICmpInst::ICMP_ULE) + Result = 1; break; case ICmpInst::ICMP_SLE: - if (pred == ICmpInst::ICMP_SGT) Result = 0; - if (pred == ICmpInst::ICMP_SLT || pred == ICmpInst::ICMP_SLE) Result = 1; + if (Predicate == ICmpInst::ICMP_SGT) + Result = 0; + if (Predicate == ICmpInst::ICMP_SLT || Predicate == ICmpInst::ICMP_SLE) + Result = 1; break; case ICmpInst::ICMP_UGE: - if (pred == ICmpInst::ICMP_ULT) Result = 0; - if (pred == ICmpInst::ICMP_UGT || pred == ICmpInst::ICMP_UGE) Result = 1; + if (Predicate == ICmpInst::ICMP_ULT) + Result = 0; + if (Predicate == ICmpInst::ICMP_UGT || Predicate == ICmpInst::ICMP_UGE) + Result = 1; break; case ICmpInst::ICMP_SGE: - if (pred == ICmpInst::ICMP_SLT) Result = 0; - if (pred == ICmpInst::ICMP_SGT || pred == ICmpInst::ICMP_SGE) Result = 1; + if (Predicate == ICmpInst::ICMP_SLT) + Result = 0; + if (Predicate == ICmpInst::ICMP_SGT || Predicate == ICmpInst::ICMP_SGE) + Result = 1; break; case ICmpInst::ICMP_NE: - if (pred == ICmpInst::ICMP_EQ) Result = 0; - if (pred == ICmpInst::ICMP_NE) Result = 1; + if (Predicate == ICmpInst::ICMP_EQ) + Result = 0; + if (Predicate == ICmpInst::ICMP_NE) + Result = 1; break; } @@ 
-1983,16 +1871,16 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, CE2->getType()->isVectorTy() == CE2Op0->getType()->isVectorTy() && !CE2Op0->getType()->isFPOrFPVectorTy()) { Constant *Inverse = ConstantExpr::getBitCast(C1, CE2Op0->getType()); - return ConstantExpr::getICmp(pred, Inverse, CE2Op0); + return ConstantExpr::getICmp(Predicate, Inverse, CE2Op0); } } // If the left hand side is an extension, try eliminating it. if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) { if ((CE1->getOpcode() == Instruction::SExt && - ICmpInst::isSigned((ICmpInst::Predicate)pred)) || + ICmpInst::isSigned(Predicate)) || (CE1->getOpcode() == Instruction::ZExt && - !ICmpInst::isSigned((ICmpInst::Predicate)pred))){ + !ICmpInst::isSigned(Predicate))) { Constant *CE1Op0 = CE1->getOperand(0); Constant *CE1Inverse = ConstantExpr::getTrunc(CE1, CE1Op0->getType()); if (CE1Inverse == CE1Op0) { @@ -2000,7 +1888,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, Constant *C2Inverse = ConstantExpr::getTrunc(C2, CE1Op0->getType()); if (ConstantExpr::getCast(CE1->getOpcode(), C2Inverse, C2->getType()) == C2) - return ConstantExpr::getICmp(pred, CE1Inverse, C2Inverse); + return ConstantExpr::getICmp(Predicate, CE1Inverse, C2Inverse); } } } @@ -2010,8 +1898,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // If C2 is a constant expr and C1 isn't, flip them around and fold the // other way if possible. // Also, if C1 is null and C2 isn't, flip them around. - pred = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)pred); - return ConstantExpr::getICmp(pred, C2, C1); + Predicate = ICmpInst::getSwappedPredicate(Predicate); + return ConstantExpr::getICmp(Predicate, C2, C1); } } return nullptr; @@ -2086,32 +1974,14 @@ static Constant *foldGEPOfGEP(GEPOperator *GEP, Type *PointeeTy, bool InBounds, I != E; ++I) LastI = I; - // We cannot combine indices if doing so would take us outside of an - // array or vector. Doing otherwise could trick us if we evaluated such a - // GEP as part of a load. - // - // e.g. Consider if the original GEP was: - // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c, - // i32 0, i32 0, i64 0) - // - // If we then tried to offset it by '8' to get to the third element, - // an i8, we should *not* get: - // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c, - // i32 0, i32 0, i64 8) - // - // This GEP tries to index array element '8 which runs out-of-bounds. - // Subsequent evaluation would get confused and produce erroneous results. - // - // The following prohibits such a GEP from being formed by checking to see - // if the index is in-range with respect to an array. + // We can't combine GEPs if the last index is a struct type. if (!LastI.isSequential()) return nullptr; + // We could perform the transform with non-constant index, but prefer leaving + // it as GEP of GEP rather than GEP of add for now. ConstantInt *CI = dyn_cast<ConstantInt>(Idx0); if (!CI) return nullptr; - if (LastI.isBoundedSequential() && - !isIndexInRangeOfArrayType(LastI.getSequentialNumElements(), CI)) - return nullptr; // TODO: This code may be extended to handle vectors as well. 
auto *LastIdx = cast<Constant>(GEP->getOperand(GEP->getNumOperands()-1)); @@ -2226,11 +2096,12 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, PointerType *SrcPtrTy = dyn_cast<PointerType>(CE->getOperand(0)->getType()); PointerType *DstPtrTy = dyn_cast<PointerType>(CE->getType()); - if (SrcPtrTy && DstPtrTy) { + if (SrcPtrTy && DstPtrTy && !SrcPtrTy->isOpaque() && + !DstPtrTy->isOpaque()) { ArrayType *SrcArrayTy = - dyn_cast<ArrayType>(SrcPtrTy->getElementType()); + dyn_cast<ArrayType>(SrcPtrTy->getNonOpaquePointerElementType()); ArrayType *DstArrayTy = - dyn_cast<ArrayType>(DstPtrTy->getElementType()); + dyn_cast<ArrayType>(DstPtrTy->getNonOpaquePointerElementType()); if (SrcArrayTy && DstArrayTy && SrcArrayTy->getElementType() == DstArrayTy->getElementType() && SrcPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace()) diff --git a/llvm/lib/IR/ConstantFold.h b/llvm/lib/IR/ConstantFold.h index 0cdd5cf3cbce..1aa44f4d21e5 100644 --- a/llvm/lib/IR/ConstantFold.h +++ b/llvm/lib/IR/ConstantFold.h @@ -19,6 +19,7 @@ #define LLVM_LIB_IR_CONSTANTFOLD_H #include "llvm/ADT/Optional.h" +#include "llvm/IR/InstrTypes.h" namespace llvm { template <typename T> class ArrayRef; @@ -46,7 +47,7 @@ template <typename T> class ArrayRef; Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1, Constant *V2); - Constant *ConstantFoldCompareInstruction(unsigned short predicate, + Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, Constant *C1, Constant *C2); Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool InBounds, Optional<unsigned> InRangeIndex, diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 837be910f6d8..c13990af360e 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -739,15 +739,8 @@ static bool constantIsDead(const Constant *C, bool RemoveDeadUsers) { ++I; } - if (RemoveDeadUsers) { - // If C is only used by metadata, it should not be preserved but should - // have its uses replaced. 
- if (C->isUsedByMetadata()) { - const_cast<Constant *>(C)->replaceAllUsesWith( - UndefValue::get(C->getType())); - } + if (RemoveDeadUsers) const_cast<Constant *>(C)->destroyConstant(); - } return true; } @@ -779,18 +772,22 @@ void Constant::removeDeadConstantUsers() const { } } -bool Constant::hasOneLiveUse() const { +bool Constant::hasOneLiveUse() const { return hasNLiveUses(1); } + +bool Constant::hasZeroLiveUses() const { return hasNLiveUses(0); } + +bool Constant::hasNLiveUses(unsigned N) const { unsigned NumUses = 0; - for (const Use &use : uses()) { - const Constant *User = dyn_cast<Constant>(use.getUser()); + for (const Use &U : uses()) { + const Constant *User = dyn_cast<Constant>(U.getUser()); if (!User || !constantIsDead(User, /* RemoveDeadUsers= */ false)) { ++NumUses; - if (NumUses > 1) + if (NumUses > N) return false; } } - return NumUses == 1; + return NumUses == N; } Constant *Constant::replaceUndefsWith(Constant *C, Constant *Replacement) { @@ -1491,28 +1488,6 @@ bool ConstantExpr::isCompare() const { return getOpcode() == Instruction::ICmp || getOpcode() == Instruction::FCmp; } -bool ConstantExpr::isGEPWithNoNotionalOverIndexing() const { - if (getOpcode() != Instruction::GetElementPtr) return false; - - gep_type_iterator GEPI = gep_type_begin(this), E = gep_type_end(this); - User::const_op_iterator OI = std::next(this->op_begin()); - - // The remaining indices may be compile-time known integers within the bounds - // of the corresponding notional static array types. - for (; GEPI != E; ++GEPI, ++OI) { - if (isa<UndefValue>(*OI)) - continue; - auto *CI = dyn_cast<ConstantInt>(*OI); - if (!CI || (GEPI.isBoundedSequential() && - (CI->getValue().getActiveBits() > 64 || - CI->getZExtValue() >= GEPI.getSequentialNumElements()))) - return false; - } - - // All the indices checked out. - return true; -} - bool ConstantExpr::hasIndices() const { return getOpcode() == Instruction::ExtractValue || getOpcode() == Instruction::InsertValue; @@ -2546,11 +2521,11 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C, Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced) { + auto Predicate = static_cast<CmpInst::Predicate>(pred); assert(LHS->getType() == RHS->getType()); - assert(CmpInst::isIntPredicate((CmpInst::Predicate)pred) && - "Invalid ICmp Predicate"); + assert(CmpInst::isIntPredicate(Predicate) && "Invalid ICmp Predicate"); - if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS)) + if (Constant *FC = ConstantFoldCompareInstruction(Predicate, LHS, RHS)) return FC; // Fold a few common cases... 
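hasOneLiveUse is generalized above into hasNLiveUses, which also yields the new hasZeroLiveUses query. A sketch of the kind of liveness check this enables; the helper name is invented:

#include "llvm/IR/Constants.h"
using namespace llvm;

// After dropping dead constant-expression users, ask whether any live
// use of the constant remains at all.
static bool isEffectivelyUnused(const Constant *C) {
  C->removeDeadConstantUsers();
  return C->hasZeroLiveUses();
}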
if (OnlyIfReduced) @@ -2559,7 +2534,7 @@ Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS, // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { LHS, RHS }; // Get the key type with both the opcode and predicate - const ConstantExprKeyType Key(Instruction::ICmp, ArgVec, pred); + const ConstantExprKeyType Key(Instruction::ICmp, ArgVec, Predicate); Type *ResultTy = Type::getInt1Ty(LHS->getContext()); if (VectorType *VT = dyn_cast<VectorType>(LHS->getType())) @@ -2571,11 +2546,11 @@ Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS, Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced) { + auto Predicate = static_cast<CmpInst::Predicate>(pred); assert(LHS->getType() == RHS->getType()); - assert(CmpInst::isFPPredicate((CmpInst::Predicate)pred) && - "Invalid FCmp Predicate"); + assert(CmpInst::isFPPredicate(Predicate) && "Invalid FCmp Predicate"); - if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS)) + if (Constant *FC = ConstantFoldCompareInstruction(Predicate, LHS, RHS)) return FC; // Fold a few common cases... if (OnlyIfReduced) @@ -2584,7 +2559,7 @@ Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = { LHS, RHS }; // Get the key type with both the opcode and predicate - const ConstantExprKeyType Key(Instruction::FCmp, ArgVec, pred); + const ConstantExprKeyType Key(Instruction::FCmp, ArgVec, Predicate); Type *ResultTy = Type::getInt1Ty(LHS->getContext()); if (VectorType *VT = dyn_cast<VectorType>(LHS->getType())) diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index a263d2536541..43df15e4d932 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -142,12 +142,12 @@ LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID, if (AttrKind == Attribute::AttrKind::ByVal) { // After r362128, byval attributes need to have a type attribute. Provide a // NULL one until a proper API is added for this. - return wrap(Attribute::getWithByValType(Ctx, NULL)); + return wrap(Attribute::getWithByValType(Ctx, nullptr)); } if (AttrKind == Attribute::AttrKind::StructRet) { // Same as byval. 
- return wrap(Attribute::getWithStructRetType(Ctx, NULL)); + return wrap(Attribute::getWithStructRetType(Ctx, nullptr)); } return wrap(Attribute::get(Ctx, AttrKind, Val)); @@ -796,7 +796,7 @@ LLVMTypeRef LLVMScalableVectorType(LLVMTypeRef ElementType, LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) { auto *Ty = unwrap<Type>(WrappedTy); if (auto *PTy = dyn_cast<PointerType>(Ty)) - return wrap(PTy->getElementType()); + return wrap(PTy->getPointerElementType()); if (auto *ATy = dyn_cast<ArrayType>(Ty)) return wrap(ATy->getElementType()); return wrap(cast<VectorType>(Ty)->getElementType()); @@ -1691,8 +1691,7 @@ LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal, ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices), NumIndices); Constant *Val = unwrap<Constant>(ConstantVal); - Type *Ty = - cast<PointerType>(Val->getType()->getScalarType())->getElementType(); + Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType(); return wrap(ConstantExpr::getGetElementPtr(Ty, Val, IdxList)); } @@ -1710,8 +1709,7 @@ LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal, ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices), NumIndices); Constant *Val = unwrap<Constant>(ConstantVal); - Type *Ty = - cast<PointerType>(Val->getType()->getScalarType())->getElementType(); + Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType(); return wrap(ConstantExpr::getInBoundsGetElementPtr(Ty, Val, IdxList)); } @@ -2278,7 +2276,8 @@ void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) { LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, const char *Name) { auto *PTy = cast<PointerType>(unwrap(Ty)); - return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + return wrap(GlobalAlias::create(PTy->getNonOpaquePointerElementType(), + PTy->getAddressSpace(), GlobalValue::ExternalLinkage, Name, unwrap<Constant>(Aliasee), unwrap(M))); } @@ -2293,7 +2292,7 @@ LLVMValueRef LLVMAddAlias2(LLVMModuleRef M, LLVMTypeRef ValueTy, LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M, const char *Name, size_t NameLen) { - return wrap(unwrap(M)->getNamedAlias(Name)); + return wrap(unwrap(M)->getNamedAlias(StringRef(Name, NameLen))); } LLVMValueRef LLVMGetFirstGlobalAlias(LLVMModuleRef M) { @@ -3218,7 +3217,7 @@ LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn, const char *Name) { Value *V = unwrap(Fn); FunctionType *FnT = - cast<FunctionType>(cast<PointerType>(V->getType())->getElementType()); + cast<FunctionType>(V->getType()->getNonOpaquePointerElementType()); return wrap( unwrap(B)->CreateInvoke(FnT, unwrap(Fn), unwrap(Then), unwrap(Catch), @@ -3590,7 +3589,8 @@ LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal, Value *V = unwrap(PointerVal); PointerType *Ty = cast<PointerType>(V->getType()); - return wrap(unwrap(B)->CreateLoad(Ty->getElementType(), V, Name)); + return wrap( + unwrap(B)->CreateLoad(Ty->getNonOpaquePointerElementType(), V, Name)); } LLVMValueRef LLVMBuildLoad2(LLVMBuilderRef B, LLVMTypeRef Ty, @@ -3692,8 +3692,7 @@ LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer, const char *Name) { ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices); Value *Val = unwrap(Pointer); - Type *Ty = - cast<PointerType>(Val->getType()->getScalarType())->getElementType(); + Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType(); return wrap(unwrap(B)->CreateGEP(Ty, Val, IdxList, Name)); } @@ -3709,8 
+3708,7 @@ LLVMValueRef LLVMBuildInBoundsGEP(LLVMBuilderRef B, LLVMValueRef Pointer, const char *Name) { ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices); Value *Val = unwrap(Pointer); - Type *Ty = - cast<PointerType>(Val->getType()->getScalarType())->getElementType(); + Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType(); return wrap(unwrap(B)->CreateInBoundsGEP(Ty, Val, IdxList, Name)); } @@ -3725,8 +3723,7 @@ LLVMValueRef LLVMBuildInBoundsGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef LLVMBuildStructGEP(LLVMBuilderRef B, LLVMValueRef Pointer, unsigned Idx, const char *Name) { Value *Val = unwrap(Pointer); - Type *Ty = - cast<PointerType>(Val->getType()->getScalarType())->getElementType(); + Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType(); return wrap(unwrap(B)->CreateStructGEP(Ty, Val, Idx, Name)); } @@ -3947,7 +3944,7 @@ LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn, const char *Name) { Value *V = unwrap(Fn); FunctionType *FnT = - cast<FunctionType>(cast<PointerType>(V->getType())->getElementType()); + cast<FunctionType>(V->getType()->getNonOpaquePointerElementType()); return wrap(unwrap(B)->CreateCall(FnT, unwrap(Fn), makeArrayRef(unwrap(Args), NumArgs), Name)); @@ -4022,7 +4019,16 @@ LLVMValueRef LLVMBuildIsNotNull(LLVMBuilderRef B, LLVMValueRef Val, LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { - return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name)); + Value *L = unwrap(LHS); + Type *ElemTy = L->getType()->getNonOpaquePointerElementType(); + return wrap(unwrap(B)->CreatePtrDiff(ElemTy, L, unwrap(RHS), Name)); +} + +LLVMValueRef LLVMBuildPtrDiff2(LLVMBuilderRef B, LLVMTypeRef ElemTy, + LLVMValueRef LHS, LLVMValueRef RHS, + const char *Name) { + return wrap(unwrap(B)->CreatePtrDiff(unwrap(ElemTy), unwrap(LHS), + unwrap(RHS), Name)); } LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op, diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 35af22034a12..a6e84dfbe1dd 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -33,7 +33,7 @@ static cl::opt<bool> DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU) : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr), - ValueFn(nullptr), LabelFn(nullptr), + ValueFn(nullptr), LabelFn(nullptr), AddrFn(nullptr), AllowUnresolvedNodes(AllowUnresolvedNodes) { if (CUNode) { if (const auto &ETs = CUNode->getEnumTypes()) @@ -821,12 +821,6 @@ DIExpression *DIBuilder::createExpression(ArrayRef<uint64_t> Addr) { return DIExpression::get(VMContext, Addr); } -DIExpression *DIBuilder::createExpression(ArrayRef<int64_t> Signed) { - // TODO: Remove the callers of this signed version and delete. - SmallVector<uint64_t, 8> Addr(Signed.begin(), Signed.end()); - return createExpression(Addr); -} - template <class... Ts> static DISubprogram *getSubprogram(bool IsDistinct, Ts &&...Args) { if (IsDistinct) @@ -980,6 +974,24 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr); } +Instruction *DIBuilder::insertDbgAddrIntrinsic(Value *V, + DILocalVariable *VarInfo, + DIExpression *Expr, + const DILocation *DL, + Instruction *InsertBefore) { + return insertDbgAddrIntrinsic( + V, VarInfo, Expr, DL, InsertBefore ? 
InsertBefore->getParent() : nullptr, + InsertBefore); +} + +Instruction *DIBuilder::insertDbgAddrIntrinsic(Value *V, + DILocalVariable *VarInfo, + DIExpression *Expr, + const DILocation *DL, + BasicBlock *InsertAtEnd) { + return insertDbgAddrIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr); +} + /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics. /// This abstracts over the various ways to specify an insert position. static void initIRBuilder(IRBuilder<> &Builder, const DILocation *DL, @@ -1001,6 +1013,24 @@ static Function *getDeclareIntrin(Module &M) { : Intrinsic::dbg_declare); } +Instruction *DIBuilder::insertDbgValueIntrinsic( + llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, + const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) { + if (!ValueFn) + ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); + return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB, + InsertBefore); +} + +Instruction *DIBuilder::insertDbgAddrIntrinsic( + llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, + const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) { + if (!AddrFn) + AddrFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_addr); + return insertDbgIntrinsic(AddrFn, Val, VarInfo, Expr, DL, InsertBB, + InsertBefore); +} + Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, BasicBlock *InsertBB, @@ -1024,17 +1054,20 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, return B.CreateCall(DeclareFn, Args); } -Instruction *DIBuilder::insertDbgValueIntrinsic( - Value *V, DILocalVariable *VarInfo, DIExpression *Expr, - const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) { - assert(V && "no value passed to dbg.value"); - assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.value"); +Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn, + Value *V, DILocalVariable *VarInfo, + DIExpression *Expr, + const DILocation *DL, + BasicBlock *InsertBB, + Instruction *InsertBefore) { + assert(IntrinsicFn && "must pass a non-null intrinsic function"); + assert(V && "must pass a value to a dbg intrinsic"); + assert(VarInfo && + "empty or invalid DILocalVariable* passed to debug intrinsic"); assert(DL && "Expected debug loc"); assert(DL->getScope()->getSubprogram() == VarInfo->getScope()->getSubprogram() && "Expected matching subprograms"); - if (!ValueFn) - ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); trackIfUnresolved(VarInfo); trackIfUnresolved(Expr); @@ -1044,7 +1077,7 @@ Instruction *DIBuilder::insertDbgValueIntrinsic( IRBuilder<> B(DL->getContext()); initIRBuilder(B, DL, InsertBB, InsertBefore); - return B.CreateCall(ValueFn, Args); + return B.CreateCall(IntrinsicFn, Args); } Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 7c69fbf7085d..98f25b035157 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1436,14 +1436,14 @@ LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Builder, } LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Builder, - int64_t *Addr, size_t Length) { - return wrap(unwrap(Builder)->createExpression(ArrayRef<int64_t>(Addr, - Length))); + uint64_t *Addr, size_t Length) { + return wrap( + unwrap(Builder)->createExpression(ArrayRef<uint64_t>(Addr, Length))); } LLVMMetadataRef 
LLVMDIBuilderCreateConstantValueExpression(LLVMDIBuilderRef Builder, - int64_t Value) { + uint64_t Value) { return wrap(unwrap(Builder)->createConstantValueExpression(Value)); } diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index b20e581d283a..59afb844eb89 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -567,13 +567,16 @@ Optional<DIBasicType::Signedness> DIBasicType::getSignedness() const { DIStringType *DIStringType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *StringLength, Metadata *StringLengthExp, + Metadata *StringLocationExp, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DIStringType, (Tag, Name, StringLength, StringLengthExp, - SizeInBits, AlignInBits, Encoding)); - Metadata *Ops[] = {nullptr, nullptr, Name, StringLength, StringLengthExp}; + DEFINE_GETIMPL_LOOKUP(DIStringType, + (Tag, Name, StringLength, StringLengthExp, + StringLocationExp, SizeInBits, AlignInBits, Encoding)); + Metadata *Ops[] = {nullptr, nullptr, Name, + StringLength, StringLengthExp, StringLocationExp}; DEFINE_GETIMPL_STORE(DIStringType, (Tag, SizeInBits, AlignInBits, Encoding), Ops); } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index f1a6402fb11b..1e874d7afa79 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -300,9 +300,9 @@ void Argument::removeAttr(Attribute::AttrKind Kind) { getParent()->removeParamAttr(getArgNo(), Kind); } -void Argument::removeAttrs(const AttrBuilder &B) { +void Argument::removeAttrs(const AttributeMask &AM) { AttributeList AL = getParent()->getAttributes(); - AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), B); + AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM); getParent()->setAttributes(AL); } @@ -340,7 +340,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, unsigned AddrSpace, const Twine &N, Module *M) { auto *F = new Function(Ty, Linkage, AddrSpace, N, M); - AttrBuilder B; + AttrBuilder B(F->getContext()); if (M->getUwtable()) B.addAttribute(Attribute::UWTable); switch (M->getFramePointer()) { @@ -589,8 +589,8 @@ void Function::removeFnAttr(StringRef Kind) { AttributeSets = AttributeSets.removeFnAttribute(getContext(), Kind); } -void Function::removeFnAttrs(const AttrBuilder &Attrs) { - AttributeSets = AttributeSets.removeFnAttributes(getContext(), Attrs); +void Function::removeFnAttrs(const AttributeMask &AM) { + AttributeSets = AttributeSets.removeFnAttributes(getContext(), AM); } void Function::removeRetAttr(Attribute::AttrKind Kind) { @@ -601,7 +601,7 @@ void Function::removeRetAttr(StringRef Kind) { AttributeSets = AttributeSets.removeRetAttribute(getContext(), Kind); } -void Function::removeRetAttrs(const AttrBuilder &Attrs) { +void Function::removeRetAttrs(const AttributeMask &Attrs) { AttributeSets = AttributeSets.removeRetAttributes(getContext(), Attrs); } @@ -613,7 +613,7 @@ void Function::removeParamAttr(unsigned ArgNo, StringRef Kind) { AttributeSets = AttributeSets.removeParamAttribute(getContext(), ArgNo, Kind); } -void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) { +void Function::removeParamAttrs(unsigned ArgNo, const AttributeMask &Attrs) { AttributeSets = AttributeSets.removeParamAttributes(getContext(), ArgNo, Attrs); } @@ -817,7 +817,8 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) 
{ // Opaque pointer doesn't have pointee type information, so we just mangle // address space for opaque pointer. if (!PTyp->isOpaque()) - Result += getMangledTypeStr(PTyp->getElementType(), HasUnnamedType); + Result += getMangledTypeStr(PTyp->getNonOpaquePointerElementType(), + HasUnnamedType); } else if (ArrayType *ATyp = dyn_cast<ArrayType>(Ty)) { Result += "a" + utostr(ATyp->getNumElements()) + getMangledTypeStr(ATyp->getElementType(), HasUnnamedType); @@ -1465,8 +1466,8 @@ static bool matchIntrinsicType( if (!PT || PT->getAddressSpace() != D.Pointer_AddressSpace) return true; if (!PT->isOpaque()) - return matchIntrinsicType(PT->getElementType(), Infos, ArgTys, - DeferredChecks, IsDeferredCheck); + return matchIntrinsicType(PT->getNonOpaquePointerElementType(), Infos, + ArgTys, DeferredChecks, IsDeferredCheck); // Consume IIT descriptors relating to the pointer element type. while (Infos.front().Kind == IITDescriptor::Pointer) Infos = Infos.slice(1); @@ -1573,7 +1574,8 @@ static bool matchIntrinsicType( return IsDeferredCheck || DeferCheck(Ty); Type * ReferenceType = ArgTys[D.getArgumentNumber()]; PointerType *ThisArgType = dyn_cast<PointerType>(Ty); - return (!ThisArgType || ThisArgType->getElementType() != ReferenceType); + return (!ThisArgType || + !ThisArgType->isOpaqueOrPointeeTypeMatches(ReferenceType)); } case IITDescriptor::PtrToElt: { if (D.getArgumentNumber() >= ArgTys.size()) diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index b6bd25aa1234..c832499dde06 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -95,6 +95,8 @@ void GlobalValue::eraseFromParent() { llvm_unreachable("not a global"); } +GlobalObject::~GlobalObject() { setComdat(nullptr); } + bool GlobalValue::isInterposable() const { if (isInterposableLinkage(getLinkage())) return true; @@ -103,10 +105,15 @@ bool GlobalValue::isInterposable() const { } bool GlobalValue::canBenefitFromLocalAlias() const { - // See AsmPrinter::getSymbolPreferLocal(). + // See AsmPrinter::getSymbolPreferLocal(). For a deduplicate comdat kind, + // references to a discarded local symbol from outside the group are not + // allowed, so avoid the local alias. + auto isDeduplicateComdat = [](const Comdat *C) { + return C && C->getSelectionKind() != Comdat::NoDeduplicate; + }; return hasDefaultVisibility() && GlobalObject::isExternalLinkage(getLinkage()) && !isDeclaration() && - !isa<GlobalIFunc>(this) && !hasComdat(); + !isa<GlobalIFunc>(this) && !isDeduplicateComdat(getComdat()); } unsigned GlobalValue::getAddressSpace() const { @@ -182,6 +189,14 @@ const Comdat *GlobalValue::getComdat() const { return cast<GlobalObject>(this)->getComdat(); } +void GlobalObject::setComdat(Comdat *C) { + if (ObjComdat) + ObjComdat->removeUser(this); + ObjComdat = C; + if (C) + C->addUser(this); +} + StringRef GlobalValue::getPartition() const { if (!hasPartition()) return ""; diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 98f6ccf81973..27528a69be21 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -679,7 +679,7 @@ static CallInst *CreateGCStatepointCallCommon( const Twine &Name) { // Extract out the type of the callee. 
auto *FuncPtrType = cast<PointerType>(ActualCallee->getType()); - assert(isa<FunctionType>(FuncPtrType->getElementType()) && + assert(isa<FunctionType>(FuncPtrType->getPointerElementType()) && "actual callee must be a callable value"); Module *M = Builder->GetInsertBlock()->getParent()->getParent(); @@ -736,7 +736,7 @@ static InvokeInst *CreateGCStatepointInvokeCommon( ArrayRef<T3> GCArgs, const Twine &Name) { // Extract out the type of the callee. auto *FuncPtrType = cast<PointerType>(ActualInvokee->getType()); - assert(isa<FunctionType>(FuncPtrType->getElementType()) && + assert(isa<FunctionType>(FuncPtrType->getPointerElementType()) && "actual callee must be a callable value"); Module *M = Builder->GetInsertBlock()->getParent()->getParent(); @@ -984,10 +984,8 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall( Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False, const Twine &Name, Instruction *MDFrom) { - if (auto *CC = dyn_cast<Constant>(C)) - if (auto *TC = dyn_cast<Constant>(True)) - if (auto *FC = dyn_cast<Constant>(False)) - return Insert(Folder.CreateSelect(CC, TC, FC), Name); + if (auto *V = Folder.FoldSelect(C, True, False)) + return V; SelectInst *Sel = SelectInst::Create(C, True, False); if (MDFrom) { @@ -1000,16 +998,17 @@ Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False, return Insert(Sel, Name); } -Value *IRBuilderBase::CreatePtrDiff(Value *LHS, Value *RHS, +Value *IRBuilderBase::CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name) { assert(LHS->getType() == RHS->getType() && "Pointer subtraction operand types must match!"); - auto *ArgType = cast<PointerType>(LHS->getType()); + assert(cast<PointerType>(LHS->getType()) + ->isOpaqueOrPointeeTypeMatches(ElemTy) && + "Pointer type must match element type"); Value *LHS_int = CreatePtrToInt(LHS, Type::getInt64Ty(Context)); Value *RHS_int = CreatePtrToInt(RHS, Type::getInt64Ty(Context)); Value *Difference = CreateSub(LHS_int, RHS_int); - return CreateExactSDiv(Difference, - ConstantExpr::getSizeOf(ArgType->getElementType()), + return CreateExactSDiv(Difference, ConstantExpr::getSizeOf(ElemTy), Name); } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 4480ec799c35..59b7221d1fa2 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -186,7 +186,8 @@ void Instruction::dropUndefImplyingAttrsAndUnknownMetadata( AttributeList AL = CB->getAttributes(); if (AL.isEmpty()) return; - AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes(); + AttributeMask UBImplyingAttributes = + AttributeFuncs::getUBImplyingAttributes(); for (unsigned ArgNo = 0; ArgNo < CB->arg_size(); ArgNo++) CB->removeParamAttrs(ArgNo, UBImplyingAttributes); CB->removeRetAttrs(UBImplyingAttributes); @@ -584,7 +585,7 @@ bool Instruction::mayReadFromMemory() const { case Instruction::Call: case Instruction::Invoke: case Instruction::CallBr: - return !cast<CallBase>(this)->doesNotReadMemory(); + return !cast<CallBase>(this)->onlyWritesMemory(); case Instruction::Store: return !cast<StoreInst>(this)->isUnordered(); } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 8f7318665cfb..adea7abb75cf 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -178,6 +178,18 @@ int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable, return -1; } +ConstantInt *InstrProfInstBase::getNumCounters() const { + if (InstrProfValueProfileInst::classof(this)) + 
llvm_unreachable("InstrProfValueProfileInst does not have counters!"); + return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2))); +} + +ConstantInt *InstrProfInstBase::getIndex() const { + if (InstrProfValueProfileInst::classof(this)) + llvm_unreachable("Please use InstrProfValueProfileInst::getIndex()"); + return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3))); +} + Value *InstrProfIncrementInst::getStep() const { if (InstrProfIncrementInstStep::classof(this)) { return const_cast<Value *>(getArgOperand(4)); @@ -482,6 +494,7 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); break; } + case Intrinsic::vp_merge: case Intrinsic::vp_select: VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()}); break; diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 24c4a348f4da..0b5f928165e8 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -428,20 +428,22 @@ template <> struct MDNodeKeyImpl<DIStringType> { MDString *Name; Metadata *StringLength; Metadata *StringLengthExp; + Metadata *StringLocationExp; uint64_t SizeInBits; uint32_t AlignInBits; unsigned Encoding; MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *StringLength, - Metadata *StringLengthExp, uint64_t SizeInBits, - uint32_t AlignInBits, unsigned Encoding) + Metadata *StringLengthExp, Metadata *StringLocationExp, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding) : Tag(Tag), Name(Name), StringLength(StringLength), - StringLengthExp(StringLengthExp), SizeInBits(SizeInBits), - AlignInBits(AlignInBits), Encoding(Encoding) {} + StringLengthExp(StringLengthExp), StringLocationExp(StringLocationExp), + SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding) {} MDNodeKeyImpl(const DIStringType *N) : Tag(N->getTag()), Name(N->getRawName()), StringLength(N->getRawStringLength()), StringLengthExp(N->getRawStringLengthExp()), + StringLocationExp(N->getRawStringLocationExp()), SizeInBits(N->getSizeInBits()), AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()) {} diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index bb72bec93066..4357c95aa9f6 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -256,9 +256,9 @@ private: bool wasRun; public: static char ID; - explicit FunctionPassManagerImpl() : - Pass(PT_PassManager, ID), PMDataManager(), - PMTopLevelManager(new FPPassManager()), wasRun(false) {} + explicit FunctionPassManagerImpl() + : Pass(PT_PassManager, ID), PMTopLevelManager(new FPPassManager()), + wasRun(false) {} /// \copydoc FunctionPassManager::add() void add(Pass *P) { @@ -387,8 +387,7 @@ namespace { class MPPassManager : public Pass, public PMDataManager { public: static char ID; - explicit MPPassManager() : - Pass(PT_PassManager, ID), PMDataManager() { } + explicit MPPassManager() : Pass(PT_PassManager, ID) {} // Delete on the fly managers. 
~MPPassManager() override { @@ -478,9 +477,8 @@ class PassManagerImpl : public Pass, public: static char ID; - explicit PassManagerImpl() : - Pass(PT_PassManager, ID), PMDataManager(), - PMTopLevelManager(new MPPassManager()) {} + explicit PassManagerImpl() + : Pass(PT_PassManager, ID), PMTopLevelManager(new MPPassManager()) {} /// \copydoc PassManager::add() void add(Pass *P) { diff --git a/llvm/lib/IR/TypeFinder.cpp b/llvm/lib/IR/TypeFinder.cpp index 1f757d7dbf4e..904af7e737cc 100644 --- a/llvm/lib/IR/TypeFinder.cpp +++ b/llvm/lib/IR/TypeFinder.cpp @@ -18,8 +18,10 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -34,22 +36,27 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { // Get types from global variables. for (const auto &G : M.globals()) { - incorporateType(G.getType()); + incorporateType(G.getValueType()); if (G.hasInitializer()) incorporateValue(G.getInitializer()); } // Get types from aliases. for (const auto &A : M.aliases()) { - incorporateType(A.getType()); + incorporateType(A.getValueType()); if (const Value *Aliasee = A.getAliasee()) incorporateValue(Aliasee); } + // Get types from ifuncs. + for (const auto &GI : M.ifuncs()) + incorporateType(GI.getValueType()); + // Get types from functions. SmallVector<std::pair<unsigned, MDNode *>, 4> MDForInst; for (const Function &FI : M) { - incorporateType(FI.getType()); + incorporateType(FI.getFunctionType()); + incorporateAttributes(FI.getAttributes()); for (const Use &U : FI.operands()) incorporateValue(U.get()); @@ -69,6 +76,13 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { if (&*O && !isa<Instruction>(&*O)) incorporateValue(&*O); + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) + incorporateType(GEP->getSourceElementType()); + if (auto *AI = dyn_cast<AllocaInst>(&I)) + incorporateType(AI->getAllocatedType()); + if (const auto *CB = dyn_cast<CallBase>(&I)) + incorporateAttributes(CB->getAttributes()); + // Incorporate types hiding in metadata. I.getAllMetadataOtherThanDebugLoc(MDForInst); for (const auto &MD : MDForInst) @@ -138,6 +152,9 @@ void TypeFinder::incorporateValue(const Value *V) { if (isa<Instruction>(V)) return; + if (auto *GEP = dyn_cast<GEPOperator>(V)) + incorporateType(GEP->getSourceElementType()); + // Look in operands for types. 
const User *U = cast<User>(V); for (const auto &I : U->operands()) @@ -173,3 +190,13 @@ void TypeFinder::incorporateMDNode(const MDNode *V) { } } } + +void TypeFinder::incorporateAttributes(AttributeList AL) { + if (!VisitedAttributes.insert(AL).second) + return; + + for (AttributeSet AS : AL) + for (Attribute A : AS) + if (A.isTypeAttribute()) + incorporateType(A.getValueAsType()); +} diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index fb7c423e54e2..b84edb789405 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -551,11 +551,12 @@ private: void checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr, const Value *V); void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, - const Value *V, bool IsIntrinsic); + const Value *V, bool IsIntrinsic, bool IsInlineAsm); void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs); void visitConstantExprsRecursively(const Constant *EntryC); void visitConstantExpr(const ConstantExpr *CE); + void verifyInlineAsmCall(const CallBase &Call); void verifyStatepoint(const CallBase &Call); void verifyFrameRecoverIndices(); void verifySiblingFuncletUnwinds(); @@ -1058,6 +1059,7 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) { N.getTag() == dwarf::DW_TAG_reference_type || N.getTag() == dwarf::DW_TAG_rvalue_reference_type || N.getTag() == dwarf::DW_TAG_const_type || + N.getTag() == dwarf::DW_TAG_immutable_type || N.getTag() == dwarf::DW_TAG_volatile_type || N.getTag() == dwarf::DW_TAG_restrict_type || N.getTag() == dwarf::DW_TAG_atomic_type || @@ -1792,7 +1794,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, "'noinline and alwaysinline' are incompatible!", V); - AttrBuilder IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty); + AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty); for (Attribute Attr : Attrs) { if (!Attr.isStringAttribute() && IncompatibleAttrs.contains(Attr.getKindAsEnum())) { @@ -1824,33 +1826,34 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, "Attribute 'preallocated' does not support unsized types!", V); } if (!PTy->isOpaque()) { - if (!isa<PointerType>(PTy->getElementType())) + if (!isa<PointerType>(PTy->getNonOpaquePointerElementType())) Assert(!Attrs.hasAttribute(Attribute::SwiftError), "Attribute 'swifterror' only applies to parameters " "with pointer to pointer type!", V); if (Attrs.hasAttribute(Attribute::ByRef)) { - Assert(Attrs.getByRefType() == PTy->getElementType(), + Assert(Attrs.getByRefType() == PTy->getNonOpaquePointerElementType(), "Attribute 'byref' type does not match parameter!", V); } if (Attrs.hasAttribute(Attribute::ByVal) && Attrs.getByValType()) { - Assert(Attrs.getByValType() == PTy->getElementType(), + Assert(Attrs.getByValType() == PTy->getNonOpaquePointerElementType(), "Attribute 'byval' type does not match parameter!", V); } if (Attrs.hasAttribute(Attribute::Preallocated)) { - Assert(Attrs.getPreallocatedType() == PTy->getElementType(), + Assert(Attrs.getPreallocatedType() == + PTy->getNonOpaquePointerElementType(), "Attribute 'preallocated' type does not match parameter!", V); } if (Attrs.hasAttribute(Attribute::InAlloca)) { - Assert(Attrs.getInAllocaType() == PTy->getElementType(), + Assert(Attrs.getInAllocaType() == PTy->getNonOpaquePointerElementType(), "Attribute 'inalloca' type does not match parameter!", V); } if (Attrs.hasAttribute(Attribute::ElementType)) { - Assert(Attrs.getElementType() == PTy->getElementType(), + Assert(Attrs.getElementType() == 
PTy->getNonOpaquePointerElementType(), "Attribute 'elementtype' type does not match parameter!", V); } } @@ -1870,7 +1873,8 @@ void Verifier::checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr, // Check parameter attributes against a function type. // The value V is printed in error messages. void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, - const Value *V, bool IsIntrinsic) { + const Value *V, bool IsIntrinsic, + bool IsInlineAsm) { if (Attrs.isEmpty()) return; @@ -1913,8 +1917,10 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, if (!IsIntrinsic) { Assert(!ArgAttrs.hasAttribute(Attribute::ImmArg), "immarg attribute only applies to intrinsics",V); - Assert(!ArgAttrs.hasAttribute(Attribute::ElementType), - "Attribute 'elementtype' can only be applied to intrinsics.", V); + if (!IsInlineAsm) + Assert(!ArgAttrs.hasAttribute(Attribute::ElementType), + "Attribute 'elementtype' can only be applied to intrinsics" + " and inline asm.", V); } verifyParameterAttrs(ArgAttrs, Ty, V); @@ -2141,6 +2147,33 @@ bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) { return Attrs.getNumAttrSets() <= Params + 2; } +void Verifier::verifyInlineAsmCall(const CallBase &Call) { + const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand()); + unsigned ArgNo = 0; + for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + // Only deal with constraints that correspond to call arguments. + if (!CI.hasArg()) + continue; + + if (CI.isIndirect) { + const Value *Arg = Call.getArgOperand(ArgNo); + Assert(Arg->getType()->isPointerTy(), + "Operand for indirect constraint must have pointer type", + &Call); + + Assert(Call.getAttributes().getParamElementType(ArgNo), + "Operand for indirect constraint must have elementtype attribute", + &Call); + } else { + Assert(!Call.paramHasAttr(ArgNo, Attribute::ElementType), + "Elementtype attribute can only be applied for indirect " + "constraints", &Call); + } + + ArgNo++; + } +} + /// Verify that statepoint intrinsic is well formed. void Verifier::verifyStatepoint(const CallBase &Call) { assert(Call.getCalledFunction() && @@ -2163,9 +2196,10 @@ void Verifier::verifyStatepoint(const CallBase &Call) { const Value *Target = Call.getArgOperand(2); auto *PT = dyn_cast<PointerType>(Target->getType()); - Assert(PT && PT->getElementType()->isFunctionTy(), + Assert(PT && PT->getPointerElementType()->isFunctionTy(), "gc.statepoint callee must be of function pointer type", Call, Target); - FunctionType *TargetFuncType = cast<FunctionType>(PT->getElementType()); + FunctionType *TargetFuncType = + cast<FunctionType>(PT->getPointerElementType()); const int NumCallArgs = cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue(); Assert(NumCallArgs >= 0, @@ -2364,7 +2398,7 @@ void Verifier::visitFunction(const Function &F) { bool IsIntrinsic = F.isIntrinsic(); // Check function attributes. - verifyFunctionAttrs(FT, Attrs, &F, IsIntrinsic); + verifyFunctionAttrs(FT, Attrs, &F, IsIntrinsic, /* IsInlineAsm */ false); // On function declarations/definitions, we do not support the builtin // attribute. We do not check this in VerifyFunctionAttrs since that is @@ -2779,6 +2813,7 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) { Assert(ArgBBs.count(BB), "Indirect label missing from arglist.", &CBI); } + verifyInlineAsmCall(CBI); visitTerminator(CBI); } @@ -3123,7 +3158,7 @@ void Verifier::visitCallBase(CallBase &Call) { } // Verify call attributes. 
- verifyFunctionAttrs(FTy, Attrs, &Call, IsIntrinsic); + verifyFunctionAttrs(FTy, Attrs, &Call, IsIntrinsic, Call.isInlineAsm()); // Conservatively check the inalloca argument. // We have a bug if we can find that there is an underlying alloca without @@ -3316,6 +3351,9 @@ void Verifier::visitCallBase(CallBase &Call) { "debug info must have a !dbg location", Call); + if (Call.isInlineAsm()) + verifyInlineAsmCall(Call); + visitInstruction(Call); } @@ -3345,13 +3383,13 @@ static bool isTypeCongruent(Type *L, Type *R) { return PL->getAddressSpace() == PR->getAddressSpace(); } -static AttrBuilder getParameterABIAttributes(unsigned I, AttributeList Attrs) { +static AttrBuilder getParameterABIAttributes(LLVMContext& C, unsigned I, AttributeList Attrs) { static const Attribute::AttrKind ABIAttrs[] = { Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, Attribute::InReg, Attribute::StackAlignment, Attribute::SwiftSelf, Attribute::SwiftAsync, Attribute::SwiftError, Attribute::Preallocated, Attribute::ByRef}; - AttrBuilder Copy; + AttrBuilder Copy(C); for (auto AK : ABIAttrs) { Attribute Attr = Attrs.getParamAttrs(I).getAttribute(AK); if (Attr.isValid()) @@ -3414,12 +3452,12 @@ void Verifier::verifyMustTailCall(CallInst &CI) { // - Only sret, byval, swiftself, and swiftasync ABI-impacting attributes // are allowed in swifttailcc call for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) { - AttrBuilder ABIAttrs = getParameterABIAttributes(I, CallerAttrs); + AttrBuilder ABIAttrs = getParameterABIAttributes(F->getContext(), I, CallerAttrs); SmallString<32> Context{CCName, StringRef(" musttail caller")}; verifyTailCCMustTailAttrs(ABIAttrs, Context); } for (unsigned I = 0, E = CalleeTy->getNumParams(); I != E; ++I) { - AttrBuilder ABIAttrs = getParameterABIAttributes(I, CalleeAttrs); + AttrBuilder ABIAttrs = getParameterABIAttributes(F->getContext(), I, CalleeAttrs); SmallString<32> Context{CCName, StringRef(" musttail callee")}; verifyTailCCMustTailAttrs(ABIAttrs, Context); } @@ -3446,8 +3484,8 @@ void Verifier::verifyMustTailCall(CallInst &CI) { // - All ABI-impacting function attributes, such as sret, byval, inreg, // returned, preallocated, and inalloca, must match. for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) { - AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs); - AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs); + AttrBuilder CallerABIAttrs = getParameterABIAttributes(F->getContext(), I, CallerAttrs); + AttrBuilder CalleeABIAttrs = getParameterABIAttributes(F->getContext(), I, CalleeAttrs); Assert(CallerABIAttrs == CalleeABIAttrs, "cannot guarantee tail call due to mismatched ABI impacting " "function attributes", @@ -3963,6 +4001,11 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { "A single unwind edge may only enter one EH pad", TI); Assert(Seen.insert(FromPad).second, "EH pad jumps through a cycle of pads", FromPad); + + // This will be diagnosed on the corresponding instruction already. We + // need the extra check here to make sure getParentPad() works. + Assert(isa<FuncletPadInst>(FromPad) || isa<CatchSwitchInst>(FromPad), + "Parent pad must be catchpad/cleanuppad/catchswitch", TI); } } } @@ -4964,7 +5007,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { // Assert that result type matches wrapped callee. 
const Value *Target = StatepointCall->getArgOperand(2); auto *PT = cast<PointerType>(Target->getType()); - auto *TargetFuncType = cast<FunctionType>(PT->getElementType()); + auto *TargetFuncType = cast<FunctionType>(PT->getPointerElementType()); Assert(Call.getType() == TargetFuncType->getReturnType(), "gc.result result type does not match wrapped callee", Call); break; @@ -5271,7 +5314,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { PointerType *Op0PtrTy = cast<PointerType>(Call.getArgOperand(0)->getType()); if (!Op0PtrTy->isOpaque()) - Op0ElemTy = Op0PtrTy->getElementType(); + Op0ElemTy = Op0PtrTy->getNonOpaquePointerElementType(); break; } case Intrinsic::matrix_column_major_store: { @@ -5285,7 +5328,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { PointerType *Op1PtrTy = cast<PointerType>(Call.getArgOperand(1)->getType()); if (!Op1PtrTy->isOpaque()) - Op1ElemTy = Op1PtrTy->getElementType(); + Op1ElemTy = Op1PtrTy->getNonOpaquePointerElementType(); break; } default: @@ -5316,6 +5359,24 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } + case Intrinsic::experimental_vector_splice: { + VectorType *VecTy = cast<VectorType>(Call.getType()); + int64_t Idx = cast<ConstantInt>(Call.getArgOperand(2))->getSExtValue(); + int64_t KnownMinNumElements = VecTy->getElementCount().getKnownMinValue(); + if (Call.getParent() && Call.getParent()->getParent()) { + AttributeList Attrs = Call.getParent()->getParent()->getAttributes(); + if (Attrs.hasFnAttr(Attribute::VScaleRange)) + KnownMinNumElements *= Attrs.getFnAttrs().getVScaleRangeMin(); + } + Assert((Idx < 0 && std::abs(Idx) <= KnownMinNumElements) || + (Idx >= 0 && Idx < KnownMinNumElements), + "The splice index exceeds the range [-VL, VL-1] where VL is the " + "known minimum number of elements in the vector. 
For scalable " + "vectors the minimum number of elements is determined from " + "vscale_range.", + &Call); + break; + } case Intrinsic::experimental_stepvector: { VectorType *VecTy = dyn_cast<VectorType>(Call.getType()); Assert(VecTy && VecTy->getScalarType()->isIntegerTy() && diff --git a/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/llvm/lib/InterfaceStub/ELFObjHandler.cpp index 0d1a864f31ac..cb72f57f7bde 100644 --- a/llvm/lib/InterfaceStub/ELFObjHandler.cpp +++ b/llvm/lib/InterfaceStub/ELFObjHandler.cpp @@ -19,7 +19,6 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Process.h" -using llvm::MemoryBufferRef; using llvm::object::ELFObjectFile; using namespace llvm; diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp index e6bf09232ce2..4ccbb18ca04a 100644 --- a/llvm/lib/InterfaceStub/IFSHandler.cpp +++ b/llvm/lib/InterfaceStub/IFSHandler.cpp @@ -195,7 +195,7 @@ Expected<std::unique_ptr<IFSStub>> ifs::readIFSFromBuffer(StringRef Buf) { } Error ifs::writeIFSToOutputStream(raw_ostream &OS, const IFSStub &Stub) { - yaml::Output YamlOut(OS, NULL, /*WrapColumn =*/0); + yaml::Output YamlOut(OS, nullptr, /*WrapColumn =*/0); std::unique_ptr<IFSStubTriple> CopyStub(new IFSStubTriple(Stub)); if (Stub.Target.Arch) { CopyStub->Target.ArchString = std::string( diff --git a/llvm/lib/InterfaceStub/IFSStub.cpp b/llvm/lib/InterfaceStub/IFSStub.cpp index 008263f8db9f..1ce7a66869b8 100644 --- a/llvm/lib/InterfaceStub/IFSStub.cpp +++ b/llvm/lib/InterfaceStub/IFSStub.cpp @@ -37,7 +37,7 @@ IFSStubTriple::IFSStubTriple(IFSStubTriple const &Stub) : IFSStub() { Symbols = Stub.Symbols; } -IFSStubTriple::IFSStubTriple(IFSStub const &Stub) : IFSStub() { +IFSStubTriple::IFSStubTriple(IFSStub const &Stub) { IfsVersion = Stub.IfsVersion; Target = Stub.Target; SoName = Stub.SoName; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 855d0fc8a8be..7694c9848384 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -229,8 +229,7 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, PGOOpt = PGOOptions("", "", "", PGOOptions::NoAction, PGOOptions::NoCSAction, true); } - if (TM) - TM->setPGOOption(PGOOpt); + TM->setPGOOption(PGOOpt); LoopAnalysisManager LAM; FunctionAnalysisManager FAM; @@ -415,6 +414,8 @@ static void codegen(const Config &Conf, TargetMachine *TM, TM->Options.ObjectFilenameForDebug = Stream->ObjectPathName; legacy::PassManager CodeGenPasses; + TargetLibraryInfoImpl TLII(Triple(Mod.getTargetTriple())); + CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII)); CodeGenPasses.add( createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex)); if (Conf.PreCodeGenPassesHook) diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 5c2aaddff4d1..119237bb052e 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -620,6 +620,8 @@ void MCAsmStreamer::emitVersionMin(MCVersionMinType Type, unsigned Major, static const char *getPlatformName(MachO::PlatformType Type) { switch (Type) { + case MachO::PLATFORM_UNKNOWN: /* silence warning*/ + break; case MachO::PLATFORM_MACOS: return "macos"; case MachO::PLATFORM_IOS: return "ios"; case MachO::PLATFORM_TVOS: return "tvos"; diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index aa4051aa2400..7f639e9c408f 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -978,13 +978,3 @@ void MCContext::reportWarning(SMLoc Loc, const Twine &Msg) { }); } } - -void 
MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) { - reportError(Loc, Msg); - - // If we reached here, we are failing ungracefully. Run the interrupt handlers - // to make sure any special cleanups get done, in particular that we remove - // files registered with RemoveFileOnSignal. - sys::RunInterruptHandlers(); - exit(1); -} diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp index 1c9cfb9042e2..2cb5a000f88a 100644 --- a/llvm/lib/MC/MCDwarf.cpp +++ b/llvm/lib/MC/MCDwarf.cpp @@ -561,7 +561,7 @@ Expected<unsigned> MCDwarfLineTable::tryGetFile(StringRef &Directory, static bool isRootFile(const MCDwarfFile &RootFile, StringRef &Directory, StringRef &FileName, Optional<MD5::MD5Result> Checksum) { - if (RootFile.Name.empty() || RootFile.Name != FileName.data()) + if (RootFile.Name.empty() || StringRef(RootFile.Name) != FileName) return false; return RootFile.Checksum == Checksum; } @@ -586,7 +586,7 @@ MCDwarfLineTableHeader::tryGetFile(StringRef &Directory, trackMD5Usage(Checksum.hasValue()); HasSource = (Source != None); } - if (isRootFile(RootFile, Directory, FileName, Checksum) && DwarfVersion >= 5) + if (DwarfVersion >= 5 && isRootFile(RootFile, Directory, FileName, Checksum)) return 0; if (FileNumber == 0) { // File numbers start with 1 and/or after any file numbers diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 3edf7a3f49e6..88aeeb980738 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -116,8 +116,16 @@ public: void emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override { getAssembler().getLOHContainer().addDirective(Kind, Args); } + void emitCGProfileEntry(const MCSymbolRefExpr *From, + const MCSymbolRefExpr *To, uint64_t Count) override { + if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary()) + getAssembler().CGProfile.push_back({From, To, Count}); + } void finishImpl() override; + + void finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE); + void finalizeCGProfile(); }; } // end anonymous namespace. @@ -145,7 +153,8 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) { if (SegName == "__DATA" && (SecName == "__nl_symbol_ptr" || SecName == "__thread_ptr")) return true; - + if (SegName == "__LLVM" && SecName == "__cg_profile") + return true; return false; } @@ -513,9 +522,40 @@ void MCMachOStreamer::finishImpl() { } } + finalizeCGProfile(); + this->MCObjectStreamer::finishImpl(); } +void MCMachOStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) { + const MCSymbol *S = &SRE->getSymbol(); + bool Created; + getAssembler().registerSymbol(*S, &Created); + if (Created) + S->setExternal(true); +} + +void MCMachOStreamer::finalizeCGProfile() { + MCAssembler &Asm = getAssembler(); + if (Asm.CGProfile.empty()) + return; + for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) { + finalizeCGProfileEntry(E.From); + finalizeCGProfileEntry(E.To); + } + // We can't write the section out until symbol indices are finalized which + // doesn't happen until after section layout. We need to create the section + // and set its size now so that it's accounted for in layout. + MCSection *CGProfileSection = Asm.getContext().getMachOSection( + "__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); + Asm.registerSection(*CGProfileSection); + auto *Frag = new MCDataFragment(CGProfileSection); + // For each entry, reserve space for 2 32-bit indices and a 64-bit count. 
+ size_t SectionBytes = + Asm.CGProfile.size() * (2 * sizeof(uint32_t) + sizeof(uint64_t)); + Frag->getContents().resize(SectionBytes); +} + MCStreamer *llvm::createMachOStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB, std::unique_ptr<MCObjectWriter> &&OW, diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 6604d7988c4c..ebbbd6ad4e16 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -119,8 +119,31 @@ void MCObjectStreamer::resolvePendingFixups() { continue; } flushPendingLabels(PendingFixup.DF, PendingFixup.DF->getContents().size()); - PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset()); - PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup); + PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset() + + PendingFixup.Fixup.getOffset()); + + // If the location symbol to relocate is in MCEncodedFragmentWithFixups, + // put the Fixup into location symbol's fragment. Otherwise + // put into PendingFixup.DF + MCFragment *SymFragment = PendingFixup.Sym->getFragment(); + switch (SymFragment->getKind()) { + case MCFragment::FT_Relaxable: + case MCFragment::FT_Dwarf: + case MCFragment::FT_PseudoProbe: + cast<MCEncodedFragmentWithFixups<8, 1>>(SymFragment) + ->getFixups() + .push_back(PendingFixup.Fixup); + break; + case MCFragment::FT_Data: + case MCFragment::FT_CVDefRange: + cast<MCEncodedFragmentWithFixups<32, 4>>(SymFragment) + ->getFixups() + .push_back(PendingFixup.Fixup); + break; + default: + PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup); + break; + } } PendingFixups.clear(); } @@ -816,8 +839,9 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, return None; } - PendingFixups.emplace_back(&SRE.getSymbol(), DF, - MCFixup::create(-1, Expr, Kind, Loc)); + PendingFixups.emplace_back( + &SRE.getSymbol(), DF, + MCFixup::create(OffsetVal.getConstant(), Expr, Kind, Loc)); return None; } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 705f7159d55b..0cea491f227d 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -159,7 +159,7 @@ private: int64_t LineNumber; SMLoc Loc; unsigned Buf; - CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {} + CppHashInfoTy() : LineNumber(0), Buf(0) {} }; CppHashInfoTy CppHashInfo; @@ -1121,11 +1121,8 @@ StringRef AsmParser::parseStringToComma() { bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) { if (parseExpression(Res)) return true; - if (Lexer.isNot(AsmToken::RParen)) - return TokError("expected ')' in parentheses expression"); EndLoc = Lexer.getTok().getEndLoc(); - Lex(); - return false; + return parseRParen(); } /// Parse a bracket expression and return it. @@ -1214,9 +1211,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, Lex(); // eat '('. StringRef VName; parseIdentifier(VName); - // eat ')'. - if (parseToken(AsmToken::RParen, - "unexpected token in variant, expected ')'")) + if (parseRParen()) return true; Split = std::make_pair(Identifier, VName); } @@ -1379,9 +1374,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, Lex(); // Eat the operator. if (parseExpression(Res, EndLoc)) return true; - if (Lexer.isNot(AsmToken::RParen)) - return TokError("expected ')'"); - Lex(); // Eat the operator. 
+ if (parseRParen()) + return true; Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx); return !Res; } @@ -1553,8 +1547,7 @@ bool AsmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res, // This is the same behavior as parseParenExpression(). if (ParenDepth - 1 > 0) { EndLoc = getTok().getEndLoc(); - if (parseToken(AsmToken::RParen, - "expected ')' in parentheses expression")) + if (parseRParen()) return true; } } @@ -5047,15 +5040,7 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) { // NOTE: a size of zero for a .comm should create an undefined symbol // but a size of .lcomm creates a bss symbol of size zero. if (Size < 0) - return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't " - "be less than zero"); - - // NOTE: The alignment in the directive is a power of 2 value, the assembler - // may internally end up wanting an alignment in bytes. - // FIXME: Diagnose overflow. - if (Pow2Alignment < 0) - return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive " - "alignment, can't be less than zero"); + return Error(SizeLoc, "size must be non-negative"); Sym->redefineIfPossible(); if (!Sym->isUndefined()) diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp index 3bc13012c019..308b3842c61e 100644 --- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp @@ -195,6 +195,8 @@ public: addDirectiveHandler<&DarwinAsmParser::parseMacOSXVersionMin>( ".macosx_version_min"); addDirectiveHandler<&DarwinAsmParser::parseBuildVersion>(".build_version"); + addDirectiveHandler<&DarwinAsmParser::parseDirectiveCGProfile>( + ".cg_profile"); LastVersionDirective = SMLoc(); } @@ -467,6 +469,7 @@ public: bool parseSDKVersion(VersionTuple &SDKVersion); void checkVersion(StringRef Directive, StringRef Arg, SMLoc Loc, Triple::OSType ExpectedOS); + bool parseDirectiveCGProfile(StringRef Directive, SMLoc Loc); }; } // end anonymous namespace @@ -1142,6 +1145,8 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc, static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { switch (Type) { + case MachO::PLATFORM_UNKNOWN: /* silence warning */ + break; case MachO::PLATFORM_MACOS: return Triple::MacOSX; case MachO::PLATFORM_IOS: return Triple::IOS; case MachO::PLATFORM_TVOS: return Triple::TvOS; @@ -1198,6 +1203,11 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) { return false; } +/// parseDirectiveCGProfile /// ::= .cg_profile from, to, count +bool DarwinAsmParser::parseDirectiveCGProfile(StringRef S, SMLoc Loc) { + return MCAsmParserExtension::ParseDirectiveCGProfile(S, Loc); +} namespace llvm { diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index e95019c12db7..e814cf003656 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -499,7 +499,8 @@ bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) { } static bool hasPrefix(StringRef SectionName, StringRef Prefix) { - return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back(); + return SectionName.consume_front(Prefix) && + (SectionName.empty() || SectionName[0] == '.'); } static bool allowSectionTypeMismatch(const Triple &TT, StringRef SectionName, @@ -514,7 +515,7 @@ static bool allowSectionTypeMismatch(const Triple &TT, StringRef SectionName, // MIPS .debug_* sections should have SHT_MIPS_DWARF section type to // distinguish among sections that contain DWARF and 
ECOFF debug formats, // but in assembly files these sections have SHT_PROGBITS type. - return hasPrefix(SectionName, ".debug_") && Type == ELF::SHT_PROGBITS; + return SectionName.startswith(".debug_") && Type == ELF::SHT_PROGBITS; } return false; } @@ -537,19 +538,18 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) { int64_t UniqueID = ~0; // Set the defaults first. - if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1") + if (hasPrefix(SectionName, ".rodata") || SectionName == ".rodata1") Flags |= ELF::SHF_ALLOC; else if (SectionName == ".fini" || SectionName == ".init" || - hasPrefix(SectionName, ".text.")) + hasPrefix(SectionName, ".text")) Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; - else if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" || - hasPrefix(SectionName, ".bss.") || - hasPrefix(SectionName, ".init_array.") || - hasPrefix(SectionName, ".fini_array.") || - hasPrefix(SectionName, ".preinit_array.")) + else if (hasPrefix(SectionName, ".data") || SectionName == ".data1" || + hasPrefix(SectionName, ".bss") || + hasPrefix(SectionName, ".init_array") || + hasPrefix(SectionName, ".fini_array") || + hasPrefix(SectionName, ".preinit_array")) Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE; - else if (hasPrefix(SectionName, ".tdata.") || - hasPrefix(SectionName, ".tbss.")) + else if (hasPrefix(SectionName, ".tdata") || hasPrefix(SectionName, ".tbss")) Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS; if (getLexer().is(AsmToken::Comma)) { @@ -620,15 +620,15 @@ EndStmt: if (TypeName.empty()) { if (SectionName.startswith(".note")) Type = ELF::SHT_NOTE; - else if (hasPrefix(SectionName, ".init_array.")) + else if (hasPrefix(SectionName, ".init_array")) Type = ELF::SHT_INIT_ARRAY; - else if (hasPrefix(SectionName, ".bss.")) + else if (hasPrefix(SectionName, ".bss")) Type = ELF::SHT_NOBITS; - else if (hasPrefix(SectionName, ".tbss.")) + else if (hasPrefix(SectionName, ".tbss")) Type = ELF::SHT_NOBITS; - else if (hasPrefix(SectionName, ".fini_array.")) + else if (hasPrefix(SectionName, ".fini_array")) Type = ELF::SHT_FINI_ARRAY; - else if (hasPrefix(SectionName, ".preinit_array.")) + else if (hasPrefix(SectionName, ".preinit_array")) Type = ELF::SHT_PREINIT_ARRAY; } else { if (TypeName == "init_array") diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index f1704cef46ac..f9433240743d 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" @@ -379,7 +380,7 @@ private: /// time of assembly struct tm TM; - std::vector<bool> EndStatementAtEOFStack; + BitVector EndStatementAtEOFStack; AsmCond TheCondState; std::vector<AsmCond> TheCondStack; @@ -424,7 +425,7 @@ private: int64_t LineNumber; SMLoc Loc; unsigned Buf; - CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {} + CppHashInfoTy() : LineNumber(0), Buf(0) {} }; CppHashInfoTy CppHashInfo; @@ -1516,11 +1517,8 @@ StringRef MasmParser::parseStringToEndOfStatement() { bool MasmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) { if (parseExpression(Res)) return true; - if (Lexer.isNot(AsmToken::RParen)) - return TokError("expected ')' in parentheses expression"); EndLoc = Lexer.getTok().getEndLoc(); - Lex(); - return false; + return parseRParen(); } /// Parse a bracket expression and 
return it. @@ -1838,9 +1836,8 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, Lex(); // Eat the operator. if (parseExpression(Res, EndLoc)) return true; - if (Lexer.isNot(AsmToken::RParen)) - return TokError("expected ')'"); - Lex(); // Eat the operator. + if (parseRParen()) + return true; Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx); return !Res; } @@ -1929,8 +1926,7 @@ bool MasmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res, // This is the same behavior as parseParenExpression(). if (ParenDepth - 1 > 0) { EndLoc = getTok().getEndLoc(); - if (parseToken(AsmToken::RParen, - "expected ')' in parentheses expression")) + if (parseRParen()) return true; } } @@ -3358,8 +3354,7 @@ bool MasmParser::handleMacroInvocation(const MCAsmMacro *M, SMLoc NameLoc) { } // Consume the right-parenthesis on the other side of the arguments. - if (parseToken(AsmToken::RParen, "invoking macro function '" + M->Name + - "' requires arguments in parentheses")) + if (parseRParen()) return true; // Exit values may require lexing, unfortunately. We construct a new buffer to @@ -3743,8 +3738,7 @@ bool MasmParser::parseScalarInitializer(unsigned Size, SmallVector<const MCExpr *, 1> DuplicatedValues; if (parseToken(AsmToken::LParen, "parentheses required for 'dup' contents") || - parseScalarInstList(Size, DuplicatedValues) || - parseToken(AsmToken::RParen, "unmatched parentheses")) + parseScalarInstList(Size, DuplicatedValues) || parseRParen()) return true; for (int i = 0; i < Repetitions; ++i) @@ -3950,8 +3944,7 @@ bool MasmParser::parseRealInstList(const fltSemantics &Semantics, SmallVector<APInt, 1> DuplicatedValues; if (parseToken(AsmToken::LParen, "parentheses required for 'dup' contents") || - parseRealInstList(Semantics, DuplicatedValues) || - parseToken(AsmToken::RParen, "unmatched parentheses")) + parseRealInstList(Semantics, DuplicatedValues) || parseRParen()) return true; for (int i = 0; i < Repetitions; ++i) @@ -4316,8 +4309,7 @@ bool MasmParser::parseStructInstList( std::vector<StructInitializer> DuplicatedValues; if (parseToken(AsmToken::LParen, "parentheses required for 'dup' contents") || - parseStructInstList(Structure, DuplicatedValues) || - parseToken(AsmToken::RParen, "unmatched parentheses")) + parseStructInstList(Structure, DuplicatedValues) || parseRParen()) return true; for (int i = 0; i < Repetitions; ++i) diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp index 7f7380bf810d..2ff4839d3706 100644 --- a/llvm/lib/MC/MCSectionXCOFF.cpp +++ b/llvm/lib/MC/MCSectionXCOFF.cpp @@ -34,7 +34,8 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } if (getKind().isReadOnly()) { - if (getMappingClass() != XCOFF::XMC_RO + if (getMappingClass() != XCOFF::XMC_RO && + getMappingClass() != XCOFF::XMC_TD) report_fatal_error("Unhandled storage-mapping class for .rodata csect."); printCsectDirective(OS); return; @@ -70,7 +71,8 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } if (isCsect() && getMappingClass() == XCOFF::XMC_TD) { - assert((getKind().isBSSExtern() || getKind().isBSSLocal()) && + assert((getKind().isBSSExtern() || getKind().isBSSLocal() || + getKind().isReadOnlyWithRel()) && "Unexpected section kind for toc-data"); printCsectDirective(OS); return; diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 9c37a7bebe2a..a14f0de65a9d 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1348,8 
+1348,8 @@ void MCStreamer::emitVersionForTarget( DarwinTargetVariantTriple->isMacOSX()) { emitVersionForTarget(*DarwinTargetVariantTriple, DarwinTargetVariantSDKVersion, - /*TargetVariantTriple=*/nullptr, - /*TargetVariantSDKVersion=*/VersionTuple()); + /*DarwinTargetVariantTriple=*/nullptr, + /*DarwinTargetVariantSDKVersion=*/VersionTuple()); emitDarwinTargetVariantBuildVersion( getMachoBuildVersionPlatformType(Target), LinkedTargetVersion.getMajor(), diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 16941b1cb727..56bb03ad8d42 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -759,6 +759,23 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData, UndefinedSymbolData); + if (!Asm.CGProfile.empty()) { + MCSection *CGProfileSection = Asm.getContext().getMachOSection( + "__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); + MCDataFragment *Frag = dyn_cast_or_null<MCDataFragment>( + &*CGProfileSection->getFragmentList().begin()); + assert(Frag && "call graph profile section not reserved"); + Frag->getContents().clear(); + raw_svector_ostream OS(Frag->getContents()); + for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) { + uint32_t FromIndex = CGPE.From->getSymbol().getIndex(); + uint32_t ToIndex = CGPE.To->getSymbol().getIndex(); + support::endian::write(OS, FromIndex, W.Endian); + support::endian::write(OS, ToIndex, W.Endian); + support::endian::write(OS, CGPE.Count, W.Endian); + } + } + unsigned NumSections = Asm.size(); const MCAssembler::VersionInfoType &VersionInfo = Layout.getAssembler().getVersionInfo(); diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp index 07be7b077bc9..121d320f10e6 100644 --- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp @@ -68,7 +68,8 @@ void LSUnitBase::dump() const { unsigned LSUnit::dispatch(const InstRef &IR) { const InstrDesc &Desc = IR.getInstruction()->getDesc(); - unsigned IsMemBarrier = Desc.HasSideEffects; + bool IsStoreBarrier = IR.getInstruction()->isAStoreBarrier(); + bool IsLoadBarrier = IR.getInstruction()->isALoadBarrier(); assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!"); if (Desc.MayLoad) @@ -111,12 +112,12 @@ unsigned LSUnit::dispatch(const InstRef &IR) { CurrentStoreGroupID = NewGID; - if (IsMemBarrier) + if (IsStoreBarrier) CurrentStoreBarrierGroupID = NewGID; if (Desc.MayLoad) { CurrentLoadGroupID = NewGID; - if (IsMemBarrier) + if (IsLoadBarrier) CurrentLoadBarrierGroupID = NewGID; } @@ -141,7 +142,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) { // However that group has already started execution, so we cannot add // this load to it. bool ShouldCreateANewGroup = - IsMemBarrier || !ImmediateLoadDominator || + IsLoadBarrier || !ImmediateLoadDominator || CurrentLoadBarrierGroupID == ImmediateLoadDominator || ImmediateLoadDominator <= CurrentStoreGroupID || getGroup(ImmediateLoadDominator).isExecuting(); @@ -161,7 +162,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) { } // A load barrier may not pass a previous load or load barrier. 
- if (IsMemBarrier) { + if (IsLoadBarrier) { if (ImmediateLoadDominator) { MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator); LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" @@ -181,7 +182,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) { } CurrentLoadGroupID = NewGID; - if (IsMemBarrier) + if (IsLoadBarrier) CurrentLoadBarrierGroupID = NewGID; return NewGID; } diff --git a/llvm/lib/MCA/Stages/DispatchStage.cpp b/llvm/lib/MCA/Stages/DispatchStage.cpp index 5385142698e6..66228bd5a862 100644 --- a/llvm/lib/MCA/Stages/DispatchStage.cpp +++ b/llvm/lib/MCA/Stages/DispatchStage.cpp @@ -30,7 +30,7 @@ DispatchStage::DispatchStage(const MCSubtargetInfo &Subtarget, unsigned MaxDispatchWidth, RetireControlUnit &R, RegisterFile &F) : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth), - CarryOver(0U), CarriedOver(), STI(Subtarget), RCU(R), PRF(F) { + CarryOver(0U), STI(Subtarget), RCU(R), PRF(F) { if (!DispatchWidth) DispatchWidth = Subtarget.getSchedModel().IssueWidth; } diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp index fa5c0fc66b9e..abfbc80f17c9 100644 --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -47,7 +47,7 @@ InOrderIssueStage::InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF, CustomBehaviour &CB, LSUnit &LSU) : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), LSU(LSU), - NumIssued(), SI(), CarryOver(), Bandwidth(), LastWriteBackCycle() {} + NumIssued(), CarryOver(), Bandwidth(), LastWriteBackCycle() {} unsigned InOrderIssueStage::getIssueWidth() const { return STI.getSchedModel().IssueWidth; diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index 5492692445e7..9a4ef055faa4 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" @@ -38,9 +39,6 @@ using namespace llvm; using namespace object; using namespace llvm::support::endian; -const char Magic[] = "!<arch>\n"; -const char ThinMagic[] = "!<thin>\n"; - void Archive::anchor() {} static Error malformedError(Twine Msg) { @@ -49,27 +47,62 @@ static Error malformedError(Twine Msg) { object_error::parse_failed); } +static Error +createMemberHeaderParseError(const AbstractArchiveMemberHeader *ArMemHeader, + const char *RawHeaderPtr, uint64_t Size) { + StringRef Msg("remaining size of archive too small for next archive " + "member header "); + + Expected<StringRef> NameOrErr = ArMemHeader->getName(Size); + if (NameOrErr) + return malformedError(Msg + "for " + *NameOrErr); + + consumeError(NameOrErr.takeError()); + uint64_t Offset = RawHeaderPtr - ArMemHeader->Parent->getData().data(); + return malformedError(Msg + "at offset " + Twine(Offset)); +} + +template <class T, std::size_t N> +StringRef getFieldRawString(const T (&Field)[N]) { + return StringRef(Field, N).rtrim(" "); +} + +template <class T> +StringRef CommonArchiveMemberHeader<T>::getRawAccessMode() const { + return getFieldRawString(ArMemHdr->AccessMode); +} + +template <class T> +StringRef CommonArchiveMemberHeader<T>::getRawLastModified() const { + return getFieldRawString(ArMemHdr->LastModified); +} + +template <class T> StringRef CommonArchiveMemberHeader<T>::getRawUID() const { + return getFieldRawString(ArMemHdr->UID); +} + +template 
<class T> StringRef CommonArchiveMemberHeader<T>::getRawGID() const { + return getFieldRawString(ArMemHdr->GID); +} + +template <class T> uint64_t CommonArchiveMemberHeader<T>::getOffset() const { + return reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); +} + +template class object::CommonArchiveMemberHeader<UnixArMemHdrType>; +template class object::CommonArchiveMemberHeader<BigArMemHdrType>; + ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent, const char *RawHeaderPtr, uint64_t Size, Error *Err) - : Parent(Parent), - ArMemHdr(reinterpret_cast<const ArMemHdrType *>(RawHeaderPtr)) { + : CommonArchiveMemberHeader<UnixArMemHdrType>( + Parent, reinterpret_cast<const UnixArMemHdrType *>(RawHeaderPtr)) { if (RawHeaderPtr == nullptr) return; ErrorAsOutParameter ErrAsOutParam(Err); - if (Size < sizeof(ArMemHdrType)) { - if (Err) { - std::string Msg("remaining size of archive too small for next archive " - "member header "); - Expected<StringRef> NameOrErr = getName(Size); - if (!NameOrErr) { - consumeError(NameOrErr.takeError()); - uint64_t Offset = RawHeaderPtr - Parent->getData().data(); - *Err = malformedError(Msg + "at offset " + Twine(Offset)); - } else - *Err = malformedError(Msg + "for " + NameOrErr.get()); - } + if (Size < getSizeOf()) { + *Err = createMemberHeaderParseError(this, RawHeaderPtr, Size); return; } if (ArMemHdr->Terminator[0] != '`' || ArMemHdr->Terminator[1] != '\n') { @@ -94,6 +127,19 @@ ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent, } } +BigArchiveMemberHeader::BigArchiveMemberHeader(const Archive *Parent, + const char *RawHeaderPtr, + uint64_t Size, Error *Err) + : CommonArchiveMemberHeader<BigArMemHdrType>( + Parent, reinterpret_cast<const BigArMemHdrType *>(RawHeaderPtr)) { + if (RawHeaderPtr == nullptr) + return; + ErrorAsOutParameter ErrAsOutParam(Err); + + if (Size < getSizeOf()) + *Err = createMemberHeaderParseError(this, RawHeaderPtr, Size); +} + // This gets the raw name from the ArMemHdr->Name field and checks that it is // valid for the kind of archive. If it is not valid it returns an Error. Expected<StringRef> ArchiveMemberHeader::getRawName() const { @@ -121,7 +167,69 @@ Expected<StringRef> ArchiveMemberHeader::getRawName() const { return StringRef(ArMemHdr->Name, end); } -// This gets the name looking up long names. 
Size is the size of the archive +Expected<uint64_t> +getArchiveMemberDecField(Twine FieldName, const StringRef RawField, + const Archive *Parent, + const AbstractArchiveMemberHeader *MemHeader) { + uint64_t Value; + if (RawField.getAsInteger(10, Value)) { + uint64_t Offset = MemHeader->getOffset(); + return malformedError("characters in " + FieldName + + " field in archive member header are not " + "all decimal numbers: '" + + RawField + + "' for the archive " + "member header at offset " + + Twine(Offset)); + } + return Value; +} + +Expected<uint64_t> +getArchiveMemberOctField(Twine FieldName, const StringRef RawField, + const Archive *Parent, + const AbstractArchiveMemberHeader *MemHeader) { + uint64_t Value; + if (RawField.getAsInteger(8, Value)) { + uint64_t Offset = MemHeader->getOffset(); + return malformedError("characters in " + FieldName + + " field in archive member header are not " + "all octal numbers: '" + + RawField + + "' for the archive " + "member header at offset " + + Twine(Offset)); + } + return Value; +} + +Expected<StringRef> BigArchiveMemberHeader::getRawName() const { + Expected<uint64_t> NameLenOrErr = getArchiveMemberDecField( + "NameLen", getFieldRawString(ArMemHdr->NameLen), Parent, this); + if (!NameLenOrErr) + // TODO: Out-of-line. + return NameLenOrErr.takeError(); + uint64_t NameLen = NameLenOrErr.get(); + + // If the name length is odd, pad with '\0' to get an even length. After + // padding, there is the name terminator "`\n". + uint64_t NameLenWithPadding = alignTo(NameLen, 2); + StringRef NameTerminator = "`\n"; + StringRef NameStringWithNameTerminator = + StringRef(ArMemHdr->Name, NameLenWithPadding + NameTerminator.size()); + if (!NameStringWithNameTerminator.endswith(NameTerminator)) { + uint64_t Offset = + reinterpret_cast<const char *>(ArMemHdr->Name + NameLenWithPadding) - + Parent->getData().data(); + // TODO: Out-of-line. + return malformedError( + "name does not have name terminator \"`\\n\" for archive member " + "header at offset " + + Twine(Offset)); + } + return StringRef(ArMemHdr->Name, NameLen); +} + +// This gets the name looking up long names. Size is the size of the archive // member including the header, so the size of any name following the header // is checked to make sure it does not overflow. Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const { @@ -129,7 +237,7 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const { // This can be called from the ArchiveMemberHeader constructor when the // archive header is truncated to produce an error message with the name. // Make sure the name field is not truncated.
- if (Size < offsetof(ArMemHdrType, Name) + sizeof(ArMemHdr->Name)) { + if (Size < offsetof(UnixArMemHdrType, Name) + sizeof(ArMemHdr->Name)) { uint64_t ArchiveOffset = reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); return malformedError("archive header truncated before the name field " @@ -224,126 +332,133 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const { return Name.drop_back(1); } +Expected<StringRef> BigArchiveMemberHeader::getName(uint64_t Size) const { + return getRawName(); +} + Expected<uint64_t> ArchiveMemberHeader::getSize() const { - uint64_t Ret; - if (StringRef(ArMemHdr->Size, sizeof(ArMemHdr->Size)) - .rtrim(" ") - .getAsInteger(10, Ret)) { - std::string Buf; - raw_string_ostream OS(Buf); - OS.write_escaped( - StringRef(ArMemHdr->Size, sizeof(ArMemHdr->Size)).rtrim(" ")); - OS.flush(); - uint64_t Offset = - reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); - return malformedError("characters in size field in archive header are not " - "all decimal numbers: '" + - Buf + - "' for archive " - "member header at offset " + - Twine(Offset)); - } - return Ret; + return getArchiveMemberDecField("size", getFieldRawString(ArMemHdr->Size), + Parent, this); } -Expected<sys::fs::perms> ArchiveMemberHeader::getAccessMode() const { - unsigned Ret; - if (StringRef(ArMemHdr->AccessMode, sizeof(ArMemHdr->AccessMode)) - .rtrim(' ') - .getAsInteger(8, Ret)) { - std::string Buf; - raw_string_ostream OS(Buf); - OS.write_escaped( - StringRef(ArMemHdr->AccessMode, sizeof(ArMemHdr->AccessMode)) - .rtrim(" ")); - OS.flush(); - uint64_t Offset = - reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); - return malformedError("characters in AccessMode field in archive header " - "are not all decimal numbers: '" + - Buf + "' for the archive member header at offset " + - Twine(Offset)); - } - return static_cast<sys::fs::perms>(Ret); +Expected<uint64_t> BigArchiveMemberHeader::getSize() const { + Expected<uint64_t> SizeOrErr = getArchiveMemberDecField( + "size", getFieldRawString(ArMemHdr->Size), Parent, this); + if (!SizeOrErr) + return SizeOrErr.takeError(); + + Expected<uint64_t> NameLenOrErr = getRawNameSize(); + if (!NameLenOrErr) + return NameLenOrErr.takeError(); + + return *SizeOrErr + alignTo(*NameLenOrErr, 2); +} + +Expected<uint64_t> BigArchiveMemberHeader::getRawNameSize() const { + return getArchiveMemberDecField( + "NameLen", getFieldRawString(ArMemHdr->NameLen), Parent, this); +} + +Expected<uint64_t> BigArchiveMemberHeader::getNextOffset() const { + return getArchiveMemberDecField( + "NextOffset", getFieldRawString(ArMemHdr->NextOffset), Parent, this); +} + +Expected<sys::fs::perms> AbstractArchiveMemberHeader::getAccessMode() const { + Expected<uint64_t> AccessModeOrErr = + getArchiveMemberOctField("AccessMode", getRawAccessMode(), Parent, this); + if (!AccessModeOrErr) + return AccessModeOrErr.takeError(); + return static_cast<sys::fs::perms>(*AccessModeOrErr); } Expected<sys::TimePoint<std::chrono::seconds>> -ArchiveMemberHeader::getLastModified() const { - unsigned Seconds; - if (StringRef(ArMemHdr->LastModified, sizeof(ArMemHdr->LastModified)) - .rtrim(' ') - .getAsInteger(10, Seconds)) { - std::string Buf; - raw_string_ostream OS(Buf); - OS.write_escaped( - StringRef(ArMemHdr->LastModified, sizeof(ArMemHdr->LastModified)) - .rtrim(" ")); - OS.flush(); - uint64_t Offset = - reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); - return malformedError("characters in LastModified field in 
archive header " - "are not all decimal numbers: '" + - Buf + "' for the archive member header at offset " + - Twine(Offset)); - } +AbstractArchiveMemberHeader::getLastModified() const { + Expected<uint64_t> SecondsOrErr = getArchiveMemberDecField( + "LastModified", getRawLastModified(), Parent, this); - return sys::toTimePoint(Seconds); + if (!SecondsOrErr) + return SecondsOrErr.takeError(); + + return sys::toTimePoint(*SecondsOrErr); } -Expected<unsigned> ArchiveMemberHeader::getUID() const { - unsigned Ret; - StringRef User = StringRef(ArMemHdr->UID, sizeof(ArMemHdr->UID)).rtrim(' '); +Expected<unsigned> AbstractArchiveMemberHeader::getUID() const { + StringRef User = getRawUID(); if (User.empty()) return 0; - if (User.getAsInteger(10, Ret)) { - std::string Buf; - raw_string_ostream OS(Buf); - OS.write_escaped(User); - OS.flush(); - uint64_t Offset = - reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); - return malformedError("characters in UID field in archive header " - "are not all decimal numbers: '" + - Buf + "' for the archive member header at offset " + - Twine(Offset)); - } - return Ret; + return getArchiveMemberDecField("UID", User, Parent, this); } -Expected<unsigned> ArchiveMemberHeader::getGID() const { - unsigned Ret; - StringRef Group = StringRef(ArMemHdr->GID, sizeof(ArMemHdr->GID)).rtrim(' '); +Expected<unsigned> AbstractArchiveMemberHeader::getGID() const { + StringRef Group = getRawGID(); if (Group.empty()) return 0; - if (Group.getAsInteger(10, Ret)) { - std::string Buf; - raw_string_ostream OS(Buf); - OS.write_escaped(Group); - OS.flush(); - uint64_t Offset = - reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data(); - return malformedError("characters in GID field in archive header " - "are not all decimal numbers: '" + - Buf + "' for the archive member header at offset " + - Twine(Offset)); + return getArchiveMemberDecField("GID", Group, Parent, this); +} + +Expected<bool> ArchiveMemberHeader::isThin() const { + Expected<StringRef> NameOrErr = getRawName(); + if (!NameOrErr) + return NameOrErr.takeError(); + StringRef Name = NameOrErr.get(); + return Parent->isThin() && Name != "/" && Name != "//" && Name != "/SYM64/"; +} + +Expected<const char *> ArchiveMemberHeader::getNextChildLoc() const { + uint64_t Size = getSizeOf(); + Expected<bool> isThinOrErr = isThin(); + if (!isThinOrErr) + return isThinOrErr.takeError(); + + bool isThin = isThinOrErr.get(); + if (!isThin) { + Expected<uint64_t> MemberSize = getSize(); + if (!MemberSize) + return MemberSize.takeError(); + + Size += MemberSize.get(); } - return Ret; + + // If Size is odd, add 1 to make it even. 
+ const char *NextLoc = + reinterpret_cast<const char *>(ArMemHdr) + alignTo(Size, 2); + + if (NextLoc == Parent->getMemoryBufferRef().getBufferEnd()) + return nullptr; + + return NextLoc; +} + +Expected<const char *> BigArchiveMemberHeader::getNextChildLoc() const { + if (getOffset() == + static_cast<const BigArchive *>(Parent)->getLastChildOffset()) + return nullptr; + + Expected<uint64_t> NextOffsetOrErr = getNextOffset(); + if (!NextOffsetOrErr) + return NextOffsetOrErr.takeError(); + return Parent->getData().data() + NextOffsetOrErr.get(); } Archive::Child::Child(const Archive *Parent, StringRef Data, uint16_t StartOfFile) - : Parent(Parent), Header(Parent, Data.data(), Data.size(), nullptr), - Data(Data), StartOfFile(StartOfFile) {} + : Parent(Parent), Data(Data), StartOfFile(StartOfFile) { + Header = Parent->createArchiveMemberHeader(Data.data(), Data.size(), nullptr); +} Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err) - : Parent(Parent), - Header(Parent, Start, - Parent - ? Parent->getData().size() - (Start - Parent->getData().data()) - : 0, - Err) { - if (!Start) + : Parent(Parent) { + if (!Start) { + Header = nullptr; return; + } + + Header = Parent->createArchiveMemberHeader( + Start, + Parent ? Parent->getData().size() - (Start - Parent->getData().data()) + : 0, + Err); // If we are pointed to real data, Start is not a nullptr, then there must be // a non-null Err pointer available to report malformed data on. Only in @@ -358,7 +473,7 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err) if (*Err) return; - uint64_t Size = Header.getSizeOf(); + uint64_t Size = Header->getSizeOf(); Data = StringRef(Start, Size); Expected<bool> isThinOrErr = isThinMember(); if (!isThinOrErr) { @@ -377,7 +492,7 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err) } // Setup StartOfFile and PaddingBytes. - StartOfFile = Header.getSizeOf(); + StartOfFile = Header->getSizeOf(); // Don't include attached name. Expected<StringRef> NameOrErr = getRawName(); if (!NameOrErr) { @@ -385,17 +500,20 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err) return; } StringRef Name = NameOrErr.get(); - if (Name.startswith("#1/")) { + + if (Parent->kind() == Archive::K_AIXBIG) { + // The actual start of the file is after the name and any necessary + // even-alignment padding. 
+ StartOfFile += ((Name.size() + 1) >> 1) << 1; + } else if (Name.startswith("#1/")) { uint64_t NameSize; - if (Name.substr(3).rtrim(' ').getAsInteger(10, NameSize)) { - std::string Buf; - raw_string_ostream OS(Buf); - OS.write_escaped(Name.substr(3).rtrim(' ')); - OS.flush(); + StringRef RawNameSize = Name.substr(3).rtrim(' '); + if (RawNameSize.getAsInteger(10, NameSize)) { uint64_t Offset = Start - Parent->getData().data(); *Err = malformedError("long name length characters after the #1/ are " "not all decimal numbers: '" + - Buf + "' for archive member header at offset " + + RawNameSize + + "' for archive member header at offset " + Twine(Offset)); return; } @@ -405,21 +523,15 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err) Expected<uint64_t> Archive::Child::getSize() const { if (Parent->IsThin) - return Header.getSize(); + return Header->getSize(); return Data.size() - StartOfFile; } Expected<uint64_t> Archive::Child::getRawSize() const { - return Header.getSize(); + return Header->getSize(); } -Expected<bool> Archive::Child::isThinMember() const { - Expected<StringRef> NameOrErr = Header.getRawName(); - if (!NameOrErr) - return NameOrErr.takeError(); - StringRef Name = NameOrErr.get(); - return Parent->IsThin && Name != "/" && Name != "//" && Name != "/SYM64/"; -} +Expected<bool> Archive::Child::isThinMember() const { return Header->isThin(); } Expected<std::string> Archive::Child::getFullName() const { Expected<bool> isThin = isThinMember(); @@ -462,15 +574,14 @@ Expected<StringRef> Archive::Child::getBuffer() const { } Expected<Archive::Child> Archive::Child::getNext() const { - size_t SpaceToSkip = Data.size(); - // If it's odd, add 1 to make it even. - if (SpaceToSkip & 1) - ++SpaceToSkip; + Expected<const char *> NextLocOrErr = Header->getNextChildLoc(); + if (!NextLocOrErr) + return NextLocOrErr.takeError(); - const char *NextLoc = Data.data() + SpaceToSkip; + const char *NextLoc = *NextLocOrErr; // Check to see if this is at the end of the archive. - if (NextLoc == Parent->Data.getBufferEnd()) + if (NextLoc == nullptr) return Child(nullptr, nullptr, nullptr); // Check to see if this is past the end of the archive. 
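The Archive.cpp hunks above route every fixed-width numeric header field through getArchiveMemberDecField/getArchiveMemberOctField (trim trailing space padding, reject non-digit runs) and pad AIX big-archive member names to an even length. A minimal standalone sketch of those two conventions, independent of the LLVM API; parseDecField and alignToEven are hypothetical names used only for illustration:

#include <cstdint>
#include <optional>
#include <string>

// Parse a fixed-width, space-padded decimal field of the kind found in ar
// member headers (Size, UID, GID, LastModified, NameLen). Returns nullopt
// if the trimmed field contains any non-digit character, mirroring the
// malformedError path of getArchiveMemberDecField in the hunks above.
std::optional<uint64_t> parseDecField(const char *Field, size_t Width) {
  std::string S(Field, Width);
  while (!S.empty() && S.back() == ' ') // trailing spaces are padding
    S.pop_back();
  if (S.empty())
    return std::nullopt;
  uint64_t Value = 0;
  for (char C : S) {
    if (C < '0' || C > '9')
      return std::nullopt;
    Value = Value * 10 + uint64_t(C - '0'); // no overflow check in this sketch
  }
  return Value;
}

// Even alignment used for AIX big-archive member names: equivalent to both
// alignTo(NameLen, 2) in BigArchiveMemberHeader::getRawName and the
// ((Name.size() + 1) >> 1) << 1 expression in Archive::Child::Child.
uint64_t alignToEven(uint64_t N) { return (N + 1) & ~uint64_t(1); }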
@@ -505,7 +616,8 @@ Expected<StringRef> Archive::Child::getName() const { if (!RawSizeOrErr) return RawSizeOrErr.takeError(); uint64_t RawSize = RawSizeOrErr.get(); - Expected<StringRef> NameOrErr = Header.getName(Header.getSizeOf() + RawSize); + Expected<StringRef> NameOrErr = + Header->getName(Header->getSizeOf() + RawSize); if (!NameOrErr) return NameOrErr.takeError(); StringRef Name = NameOrErr.get(); @@ -537,12 +649,39 @@ Archive::Child::getAsBinary(LLVMContext *Context) const { Expected<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) { Error Err = Error::success(); - std::unique_ptr<Archive> Ret(new Archive(Source, Err)); + std::unique_ptr<Archive> Ret; + StringRef Buffer = Source.getBuffer(); + + if (Buffer.startswith(BigArchiveMagic)) + Ret = std::make_unique<BigArchive>(Source, Err); + else + Ret = std::make_unique<Archive>(Source, Err); + if (Err) return std::move(Err); return std::move(Ret); } +std::unique_ptr<AbstractArchiveMemberHeader> +Archive::createArchiveMemberHeader(const char *RawHeaderPtr, uint64_t Size, + Error *Err) const { + ErrorAsOutParameter ErrAsOutParam(Err); + if (kind() != K_AIXBIG) + return std::make_unique<ArchiveMemberHeader>(this, RawHeaderPtr, Size, Err); + return std::make_unique<BigArchiveMemberHeader>(this, RawHeaderPtr, Size, + Err); +} + +uint64_t Archive::getArchiveMagicLen() const { + if (isThin()) + return sizeof(ThinArchiveMagic) - 1; + + if (kind() == K_AIXBIG) + return sizeof(BigArchiveMagic) - 1; + + return sizeof(ArchiveMagic) - 1; +} + void Archive::setFirstRegular(const Child &C) { FirstRegularData = C.Data; FirstRegularStartOfFile = C.StartOfFile; @@ -553,10 +692,14 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) ErrorAsOutParameter ErrAsOutParam(&Err); StringRef Buffer = Data.getBuffer(); // Check for sufficient magic. - if (Buffer.startswith(ThinMagic)) { + if (Buffer.startswith(ThinArchiveMagic)) { IsThin = true; - } else if (Buffer.startswith(Magic)) { + } else if (Buffer.startswith(ArchiveMagic)) { + IsThin = false; + } else if (Buffer.startswith(BigArchiveMagic)) { + Format = K_AIXBIG; IsThin = false; + return; } else { Err = make_error<GenericBinaryError>("file too small to be an archive", object_error::invalid_file_type); @@ -788,7 +931,7 @@ Archive::child_iterator Archive::child_begin(Error &Err, return child_iterator::itr( Child(this, FirstRegularData, FirstRegularStartOfFile), Err); - const char *Loc = Data.getBufferStart() + strlen(Magic); + const char *Loc = Data.getBufferStart() + getFirstChildOffset(); Child C(this, Loc, &Err); if (Err) return child_end(); @@ -997,6 +1140,38 @@ Expected<Optional<Archive::Child>> Archive::findSym(StringRef name) const { } // Returns true if archive file contains no member file. -bool Archive::isEmpty() const { return Data.getBufferSize() == 8; } +bool Archive::isEmpty() const { + return Data.getBufferSize() == getArchiveMagicLen(); +} bool Archive::hasSymbolTable() const { return !SymbolTable.empty(); } + +BigArchive::BigArchive(MemoryBufferRef Source, Error &Err) + : Archive(Source, Err) { + ErrorAsOutParameter ErrAsOutParam(&Err); + StringRef Buffer = Data.getBuffer(); + ArFixLenHdr = reinterpret_cast<const FixLenHdr *>(Buffer.data()); + + StringRef RawOffset = getFieldRawString(ArFixLenHdr->FirstChildOffset); + if (RawOffset.getAsInteger(10, FirstChildOffset)) + // TODO: Out-of-line.
+ Err = malformedError("malformed AIX big archive: first member offset \"" + + RawOffset + "\" is not a number"); + + RawOffset = getFieldRawString(ArFixLenHdr->LastChildOffset); + if (RawOffset.getAsInteger(10, LastChildOffset)) + // TODO: Out-of-line. + Err = malformedError("malformed AIX big archive: last member offset \"" + + RawOffset + "\" is not a number"); + + child_iterator I = child_begin(Err, false); + if (Err) + return; + child_iterator E = child_end(); + if (I == E) { + Err = Error::success(); + return; + } + setFirstRegular(*I); + Err = Error::success(); +} diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index da8bcec7f3d4..053b3dafed95 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -137,6 +137,7 @@ static bool isBSDLike(object::Archive::Kind Kind) { case object::Archive::K_DARWIN: case object::Archive::K_DARWIN64: return true; + case object::Archive::K_AIXBIG: case object::Archive::K_COFF: break; } @@ -199,6 +200,7 @@ static bool is64BitKind(object::Archive::Kind Kind) { case object::Archive::K_BSD: case object::Archive::K_DARWIN: case object::Archive::K_COFF: + case object::Archive::K_AIXBIG: return false; case object::Archive::K_DARWIN64: case object::Archive::K_GNU64: diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 093ae1bbc267..dea3d90d3560 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Comdat.h" #include "llvm/IR/DataLayout.h" @@ -22,13 +23,13 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/StringSaver.h" #include "llvm/Support/VCSRevision.h" @@ -41,6 +42,10 @@ using namespace llvm; using namespace irsymtab; +cl::opt<bool> DisableBitcodeVersionUpgrade( + "disable-bitcode-version-upgrade", cl::init(false), cl::Hidden, + cl::desc("Disable automatic bitcode upgrade for version mismatch")); + static const char *PreservedSymbols[] = { #define HANDLE_LIBCALL(code, name) name, #include "llvm/IR/RuntimeLibcalls.def" @@ -402,20 +407,22 @@ Expected<FileContents> irsymtab::readBitcode(const BitcodeFileContents &BFC) { return make_error<StringError>("Bitcode file does not contain any modules", inconvertibleErrorCode()); - if (BFC.StrtabForSymtab.empty() || - BFC.Symtab.size() < sizeof(storage::Header)) - return upgrade(BFC.Mods); - - // We cannot use the regular reader to read the version and producer, because - // it will expect the header to be in the current format. The only thing we - // can rely on is that the version and producer will be present as the first - // struct elements. 
- auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data()); - unsigned Version = Hdr->Version; - StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab); - if (Version != storage::Header::kCurrentVersion || - Producer != kExpectedProducerName) - return upgrade(BFC.Mods); + if (!DisableBitcodeVersionUpgrade) { + if (BFC.StrtabForSymtab.empty() || + BFC.Symtab.size() < sizeof(storage::Header)) + return upgrade(BFC.Mods); + + // We cannot use the regular reader to read the version and producer, + // because it will expect the header to be in the current format. The only + // thing we can rely on is that the version and producer will be present as + // the first struct elements. + auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data()); + unsigned Version = Hdr->Version; + StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab); + if (Version != storage::Header::kCurrentVersion || + Producer != kExpectedProducerName) + return upgrade(BFC.Mods); + } FileContents FC; FC.TheReader = {{BFC.Symtab.data(), BFC.Symtab.size()}, diff --git a/llvm/lib/Object/Object.cpp b/llvm/lib/Object/Object.cpp index 0659cf6a2d41..576eb8d069d6 100644 --- a/llvm/lib/Object/Object.cpp +++ b/llvm/lib/Object/Object.cpp @@ -16,6 +16,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/MachOUniversal.h" +#include "llvm/Support/MemAlloc.h" using namespace llvm; using namespace object; diff --git a/llvm/lib/Object/TapiFile.cpp b/llvm/lib/Object/TapiFile.cpp index 6b576260bdb1..83568e8d823a 100644 --- a/llvm/lib/Object/TapiFile.cpp +++ b/llvm/lib/Object/TapiFile.cpp @@ -45,8 +45,7 @@ TapiFile::TapiFile(MemoryBufferRef Source, const InterfaceFile &interface, Symbols.emplace_back(StringRef(), Symbol->getName(), getFlags(Symbol)); break; case SymbolKind::ObjectiveCClass: - if (interface.getPlatforms().count(PlatformKind::macOS) && - Arch == AK_i386) { + if (interface.getPlatforms().count(PLATFORM_MACOS) && Arch == AK_i386) { Symbols.emplace_back(ObjC1ClassNamePrefix, Symbol->getName(), getFlags(Symbol)); } else { diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp index 9b0a5efacba7..f2f6d700ddd8 100644 --- a/llvm/lib/Object/XCOFFObjectFile.cpp +++ b/llvm/lib/Object/XCOFFObjectFile.cpp @@ -1112,8 +1112,12 @@ bool XCOFFSymbolRef::isFunction() const { return true; Expected<XCOFFCsectAuxRef> ExpCsectAuxEnt = getXCOFFCsectAuxRef(); - if (!ExpCsectAuxEnt) + if (!ExpCsectAuxEnt) { + // If we could not get the CSECT auxiliary entry, then treat this symbol as + // if it isn't a function. Consume the error and return `false` to move on. 
+ consumeError(ExpCsectAuxEnt.takeError()); return false; + } const XCOFFCsectAuxRef CsectAuxRef = ExpCsectAuxEnt.get(); diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 9b9266998ea6..ffe2599beaf8 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -518,6 +518,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_RISCV_FLOAT_ABI_DOUBLE, EF_RISCV_FLOAT_ABI); BCaseMask(EF_RISCV_FLOAT_ABI_QUAD, EF_RISCV_FLOAT_ABI); BCase(EF_RISCV_RVE); + BCase(EF_RISCV_TSO); break; case ELF::EM_AMDGPU: BCaseMask(EF_AMDGPU_MACH_NONE, EF_AMDGPU_MACH); diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp index e5ffb12df434..b9fad2982828 100644 --- a/llvm/lib/ObjectYAML/MachOEmitter.cpp +++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp @@ -481,9 +481,9 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { typedef std::pair<uint64_t, writeHandler> writeOperation; std::vector<writeOperation> WriteQueue; - MachO::dyld_info_command *DyldInfoOnlyCmd = 0; - MachO::symtab_command *SymtabCmd = 0; - MachO::dysymtab_command *DSymtabCmd = 0; + MachO::dyld_info_command *DyldInfoOnlyCmd = nullptr; + MachO::symtab_command *SymtabCmd = nullptr; + MachO::dysymtab_command *DSymtabCmd = nullptr; for (auto &LC : Obj.LoadCommands) { switch (LC.Data.load_command_data.cmd) { case MachO::LC_SYMTAB: diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index cf0d058c518c..2a7204d3f773 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -47,6 +47,7 @@ private: bool initRelocations(uint64_t &CurrentOffset); bool initStringTable(); bool assignAddressesAndIndices(); + void writeFileHeader(); void writeAuxFileHeader(); void writeSectionHeader(); @@ -55,6 +56,15 @@ private: bool writeSymbols(); void writeStringTable(); + void writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym); + void writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym); + void writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym); + void writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym); + void writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym); + void writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym); + void writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym); + void writeAuxSymbol(const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym); + XCOFFYAML::Object &Obj; bool Is64Bit = false; support::endian::Writer W; @@ -190,12 +200,23 @@ bool XCOFFWriter::initStringTable() { } } } else { - for (XCOFFYAML::Symbol &YamlSym : Obj.Symbols) { + for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) { if (nameShouldBeInStringTable(YamlSym.SymbolName)) StrTblBuilder.add(YamlSym.SymbolName); } } + // Check if the file name in the File Auxiliary Entry should be added to the + // string table. 
+ for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) { + for (const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym : + YamlSym.AuxEntries) { + if (auto AS = dyn_cast<XCOFFYAML::FileAuxEnt>(AuxSym.get())) + if (nameShouldBeInStringTable(AS->FileNameOrString.getValueOr(""))) + StrTblBuilder.add(AS->FileNameOrString.getValueOr("")); + } + } + StrTblBuilder.finalize(); size_t StrTblSize = StrTblBuilder.getSize(); @@ -216,9 +237,21 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) { InitFileHdr.NumberOfSections = Obj.Sections.size(); InitFileHdr.NumberOfSymTableEntries = Obj.Symbols.size(); - for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) + for (XCOFFYAML::Symbol &YamlSym : Obj.Symbols) { + uint32_t AuxCount = YamlSym.AuxEntries.size(); + if (YamlSym.NumberOfAuxEntries && *YamlSym.NumberOfAuxEntries < AuxCount) { + ErrHandler("specified NumberOfAuxEntries " + + Twine(static_cast<uint32_t>(*YamlSym.NumberOfAuxEntries)) + + " is less than the actual number " + "of auxiliary entries " + + Twine(AuxCount)); + return false; + } + YamlSym.NumberOfAuxEntries = + YamlSym.NumberOfAuxEntries.getValueOr(AuxCount); // Add the number of auxiliary symbols to the total number. - InitFileHdr.NumberOfSymTableEntries += YamlSym.NumberOfAuxEntries; + InitFileHdr.NumberOfSymTableEntries += *YamlSym.NumberOfAuxEntries; + } // Calculate SymbolTableOffset for the file header. if (InitFileHdr.NumberOfSymTableEntries) { @@ -491,6 +524,125 @@ bool XCOFFWriter::writeRelocations() { return true; } +void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) { + if (Is64Bit) { + W.write<uint32_t>(AuxSym.SectionOrLengthLo.getValueOr(0)); + W.write<uint32_t>(AuxSym.ParameterHashIndex.getValueOr(0)); + W.write<uint16_t>(AuxSym.TypeChkSectNum.getValueOr(0)); + W.write<uint8_t>(AuxSym.SymbolAlignmentAndType.getValueOr(0)); + W.write<uint8_t>(AuxSym.StorageMappingClass.getValueOr(XCOFF::XMC_PR)); + W.write<uint32_t>(AuxSym.SectionOrLengthHi.getValueOr(0)); + W.write<uint8_t>(0); + W.write<uint8_t>(XCOFF::AUX_CSECT); + } else { + W.write<uint32_t>(AuxSym.SectionOrLength.getValueOr(0)); + W.write<uint32_t>(AuxSym.ParameterHashIndex.getValueOr(0)); + W.write<uint16_t>(AuxSym.TypeChkSectNum.getValueOr(0)); + W.write<uint8_t>(AuxSym.SymbolAlignmentAndType.getValueOr(0)); + W.write<uint8_t>(AuxSym.StorageMappingClass.getValueOr(XCOFF::XMC_PR)); + W.write<uint32_t>(AuxSym.StabInfoIndex.getValueOr(0)); + W.write<uint16_t>(AuxSym.StabSectNum.getValueOr(0)); + } +} + +void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) { + assert(Is64Bit && "can't write the exception auxiliary symbol for XCOFF32"); + W.write<uint64_t>(AuxSym.OffsetToExceptionTbl.getValueOr(0)); + W.write<uint32_t>(AuxSym.SizeOfFunction.getValueOr(0)); + W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.getValueOr(0)); + W.write<uint8_t>(0); + W.write<uint8_t>(XCOFF::AUX_EXCEPT); +} + +void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) { + if (Is64Bit) { + W.write<uint64_t>(AuxSym.PtrToLineNum.getValueOr(0)); + W.write<uint32_t>(AuxSym.SizeOfFunction.getValueOr(0)); + W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.getValueOr(0)); + W.write<uint8_t>(0); + W.write<uint8_t>(XCOFF::AUX_FCN); + } else { + W.write<uint32_t>(AuxSym.OffsetToExceptionTbl.getValueOr(0)); + W.write<uint32_t>(AuxSym.SizeOfFunction.getValueOr(0)); + W.write<uint32_t>(AuxSym.PtrToLineNum.getValueOr(0)); + W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.getValueOr(0)); + W.OS.write_zeros(2); + } +} + +void XCOFFWriter::writeAuxSymbol(const 
XCOFFYAML::FileAuxEnt &AuxSym) { + StringRef FileName = AuxSym.FileNameOrString.getValueOr(""); + if (nameShouldBeInStringTable(FileName)) { + W.write<int32_t>(0); + W.write<uint32_t>(StrTblBuilder.getOffset(FileName)); + } else { + writeName(FileName, W); + } + W.OS.write_zeros(XCOFF::FileNamePadSize); + W.write<uint8_t>(AuxSym.FileStringType.getValueOr(XCOFF::XFT_FN)); + if (Is64Bit) { + W.OS.write_zeros(2); + W.write<uint8_t>(XCOFF::AUX_FILE); + } else { + W.OS.write_zeros(3); + } +} + +void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) { + if (Is64Bit) { + W.write<uint32_t>(AuxSym.LineNum.getValueOr(0)); + W.OS.write_zeros(13); + W.write<uint8_t>(XCOFF::AUX_SYM); + } else { + W.OS.write_zeros(2); + W.write<uint16_t>(AuxSym.LineNumHi.getValueOr(0)); + W.write<uint16_t>(AuxSym.LineNumLo.getValueOr(0)); + W.OS.write_zeros(12); + } +} + +void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) { + if (Is64Bit) { + W.write<uint64_t>(AuxSym.LengthOfSectionPortion.getValueOr(0)); + W.write<uint64_t>(AuxSym.NumberOfRelocEnt.getValueOr(0)); + W.write<uint8_t>(0); + W.write<uint8_t>(XCOFF::AUX_SECT); + } else { + W.write<uint32_t>(AuxSym.LengthOfSectionPortion.getValueOr(0)); + W.OS.write_zeros(4); + W.write<uint32_t>(AuxSym.NumberOfRelocEnt.getValueOr(0)); + W.OS.write_zeros(6); + } +} + +void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) { + assert(!Is64Bit && "can't write the stat auxiliary symbol for XCOFF64"); + W.write<uint32_t>(AuxSym.SectionLength.getValueOr(0)); + W.write<uint16_t>(AuxSym.NumberOfRelocEnt.getValueOr(0)); + W.write<uint16_t>(AuxSym.NumberOfLineNum.getValueOr(0)); + W.OS.write_zeros(10); +} + +void XCOFFWriter::writeAuxSymbol( + const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym) { + if (auto AS = dyn_cast<XCOFFYAML::CsectAuxEnt>(AuxSym.get())) + writeAuxSymbol(*AS); + else if (auto AS = dyn_cast<XCOFFYAML::FunctionAuxEnt>(AuxSym.get())) + writeAuxSymbol(*AS); + else if (auto AS = dyn_cast<XCOFFYAML::ExcpetionAuxEnt>(AuxSym.get())) + writeAuxSymbol(*AS); + else if (auto AS = dyn_cast<XCOFFYAML::FileAuxEnt>(AuxSym.get())) + writeAuxSymbol(*AS); + else if (auto AS = dyn_cast<XCOFFYAML::BlockAuxEnt>(AuxSym.get())) + writeAuxSymbol(*AS); + else if (auto AS = dyn_cast<XCOFFYAML::SectAuxEntForDWARF>(AuxSym.get())) + writeAuxSymbol(*AS); + else if (auto AS = dyn_cast<XCOFFYAML::SectAuxEntForStat>(AuxSym.get())) + writeAuxSymbol(*AS); + else + llvm_unreachable("unknown auxiliary symbol type"); +} + bool XCOFFWriter::writeSymbols() { int64_t PaddingSize = (uint64_t)InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset); @@ -533,16 +685,25 @@ bool XCOFFWriter::writeSymbols() { } W.write<uint16_t>(YamlSym.Type); W.write<uint8_t>(YamlSym.StorageClass); - W.write<uint8_t>(YamlSym.NumberOfAuxEntries); - - // Now output the auxiliary entry. - for (uint8_t I = 0, E = YamlSym.NumberOfAuxEntries; I < E; ++I) { - // TODO: Auxiliary entry is not supported yet. - // The auxiliary entries for a symbol follow its symbol table entry. The - // length of each auxiliary entry is the same as a symbol table entry (18 - // bytes). The format and quantity of auxiliary entries depend on the - // storage class (n_sclass) and type (n_type) of the symbol table entry. - W.OS.write_zeros(XCOFF::SymbolTableEntrySize); + + uint8_t NumOfAuxSym = YamlSym.NumberOfAuxEntries.getValueOr(0); + W.write<uint8_t>(NumOfAuxSym); + + if (!NumOfAuxSym && !YamlSym.AuxEntries.size()) + continue; + + // Now write auxiliary entries. 
+ if (!YamlSym.AuxEntries.size()) { + W.OS.write_zeros(XCOFF::SymbolTableEntrySize * NumOfAuxSym); + } else { + for (const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym : + YamlSym.AuxEntries) { + writeAuxSymbol(AuxSym); + } + // Pad with zeros. + if (NumOfAuxSym > YamlSym.AuxEntries.size()) + W.OS.write_zeros(XCOFF::SymbolTableEntrySize * + (NumOfAuxSym - YamlSym.AuxEntries.size())); } } return true; diff --git a/llvm/lib/ObjectYAML/XCOFFYAML.cpp b/llvm/lib/ObjectYAML/XCOFFYAML.cpp index 221cf3b064c0..44ef33501b65 100644 --- a/llvm/lib/ObjectYAML/XCOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/XCOFFYAML.cpp @@ -19,6 +19,8 @@ namespace XCOFFYAML { Object::Object() { memset(&Header, 0, sizeof(Header)); } +AuxSymbolEnt::~AuxSymbolEnt() = default; + } // namespace XCOFFYAML namespace yaml { @@ -98,6 +100,56 @@ void ScalarEnumerationTraits<XCOFF::StorageClass>::enumeration( #undef ECase } +void ScalarEnumerationTraits<XCOFF::StorageMappingClass>::enumeration( + IO &IO, XCOFF::StorageMappingClass &Value) { +#define ECase(X) IO.enumCase(Value, #X, XCOFF::X) + ECase(XMC_PR); + ECase(XMC_RO); + ECase(XMC_DB); + ECase(XMC_GL); + ECase(XMC_XO); + ECase(XMC_SV); + ECase(XMC_SV64); + ECase(XMC_SV3264); + ECase(XMC_TI); + ECase(XMC_TB); + ECase(XMC_RW); + ECase(XMC_TC0); + ECase(XMC_TC); + ECase(XMC_TD); + ECase(XMC_DS); + ECase(XMC_UA); + ECase(XMC_BS); + ECase(XMC_UC); + ECase(XMC_TL); + ECase(XMC_UL); + ECase(XMC_TE); +#undef ECase +} + +void ScalarEnumerationTraits<XCOFFYAML::AuxSymbolType>::enumeration( + IO &IO, XCOFFYAML::AuxSymbolType &Type) { +#define ECase(X) IO.enumCase(Type, #X, XCOFFYAML::X) + ECase(AUX_EXCEPT); + ECase(AUX_FCN); + ECase(AUX_SYM); + ECase(AUX_FILE); + ECase(AUX_CSECT); + ECase(AUX_SECT); + ECase(AUX_STAT); +#undef ECase +} + +void ScalarEnumerationTraits<XCOFF::CFileStringType>::enumeration( + IO &IO, XCOFF::CFileStringType &Type) { +#define ECase(X) IO.enumCase(Type, #X, XCOFF::X) + ECase(XFT_FN); + ECase(XFT_CT); + ECase(XFT_CV); + ECase(XFT_CD); +#undef ECase +} + struct NSectionFlags { NSectionFlags(IO &) : Flags(XCOFF::SectionTypeFlags(0)) {} NSectionFlags(IO &, uint32_t C) : Flags(XCOFF::SectionTypeFlags(C)) {} @@ -173,6 +225,107 @@ void MappingTraits<XCOFFYAML::Section>::mapping(IO &IO, IO.mapOptional("Relocations", Sec.Relocations); } +static void auxSymMapping(IO &IO, XCOFFYAML::CsectAuxEnt &AuxSym, bool Is64) { + IO.mapOptional("ParameterHashIndex", AuxSym.ParameterHashIndex); + IO.mapOptional("TypeChkSectNum", AuxSym.TypeChkSectNum); + IO.mapOptional("SymbolAlignmentAndType", AuxSym.SymbolAlignmentAndType); + IO.mapOptional("StorageMappingClass", AuxSym.StorageMappingClass); + if (Is64) { + IO.mapOptional("SectionOrLengthLo", AuxSym.SectionOrLengthLo); + IO.mapOptional("SectionOrLengthHi", AuxSym.SectionOrLengthHi); + } else { + IO.mapOptional("SectionOrLength", AuxSym.SectionOrLength); + IO.mapOptional("StabInfoIndex", AuxSym.StabInfoIndex); + IO.mapOptional("StabSectNum", AuxSym.StabSectNum); + } +} + +static void auxSymMapping(IO &IO, XCOFFYAML::FileAuxEnt &AuxSym) { + IO.mapOptional("FileNameOrString", AuxSym.FileNameOrString); + IO.mapOptional("FileStringType", AuxSym.FileStringType); +} + +static void auxSymMapping(IO &IO, XCOFFYAML::BlockAuxEnt &AuxSym, bool Is64) { + if (Is64) { + IO.mapOptional("LineNum", AuxSym.LineNum); + } else { + IO.mapOptional("LineNumHi", AuxSym.LineNumHi); + IO.mapOptional("LineNumLo", AuxSym.LineNumLo); + } +} + +static void auxSymMapping(IO &IO, XCOFFYAML::FunctionAuxEnt &AuxSym, + bool Is64) { + if (!Is64) + 
IO.mapOptional("OffsetToExceptionTbl", AuxSym.OffsetToExceptionTbl); + IO.mapOptional("SizeOfFunction", AuxSym.SizeOfFunction); + IO.mapOptional("SymIdxOfNextBeyond", AuxSym.SymIdxOfNextBeyond); + IO.mapOptional("PtrToLineNum", AuxSym.PtrToLineNum); +} + +static void auxSymMapping(IO &IO, XCOFFYAML::ExcpetionAuxEnt &AuxSym) { + IO.mapOptional("OffsetToExceptionTbl", AuxSym.OffsetToExceptionTbl); + IO.mapOptional("SizeOfFunction", AuxSym.SizeOfFunction); + IO.mapOptional("SymIdxOfNextBeyond", AuxSym.SymIdxOfNextBeyond); +} + +static void auxSymMapping(IO &IO, XCOFFYAML::SectAuxEntForDWARF &AuxSym) { + IO.mapOptional("LengthOfSectionPortion", AuxSym.LengthOfSectionPortion); + IO.mapOptional("NumberOfRelocEnt", AuxSym.NumberOfRelocEnt); +} + +static void auxSymMapping(IO &IO, XCOFFYAML::SectAuxEntForStat &AuxSym) { + IO.mapOptional("SectionLength", AuxSym.SectionLength); + IO.mapOptional("NumberOfRelocEnt", AuxSym.NumberOfRelocEnt); + IO.mapOptional("NumberOfLineNum", AuxSym.NumberOfLineNum); +} + +void MappingTraits<std::unique_ptr<XCOFFYAML::AuxSymbolEnt>>::mapping( + IO &IO, std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym) { + assert(!IO.outputting() && "We don't dump aux symbols currently."); + const bool Is64 = + static_cast<XCOFFYAML::Object *>(IO.getContext())->Header.Magic == + (llvm::yaml::Hex16)XCOFF::XCOFF64; + XCOFFYAML::AuxSymbolType AuxType; + IO.mapRequired("Type", AuxType); + switch (AuxType) { + case XCOFFYAML::AUX_EXCEPT: + if (!Is64) + IO.setError("an auxiliary symbol of type AUX_EXCEPT cannot be defined in " + "XCOFF32"); + AuxSym.reset(new XCOFFYAML::ExcpetionAuxEnt()); + auxSymMapping(IO, *cast<XCOFFYAML::ExcpetionAuxEnt>(AuxSym.get())); + break; + case XCOFFYAML::AUX_FCN: + AuxSym.reset(new XCOFFYAML::FunctionAuxEnt()); + auxSymMapping(IO, *cast<XCOFFYAML::FunctionAuxEnt>(AuxSym.get()), Is64); + break; + case XCOFFYAML::AUX_SYM: + AuxSym.reset(new XCOFFYAML::BlockAuxEnt()); + auxSymMapping(IO, *cast<XCOFFYAML::BlockAuxEnt>(AuxSym.get()), Is64); + break; + case XCOFFYAML::AUX_FILE: + AuxSym.reset(new XCOFFYAML::FileAuxEnt()); + auxSymMapping(IO, *cast<XCOFFYAML::FileAuxEnt>(AuxSym.get())); + break; + case XCOFFYAML::AUX_CSECT: + AuxSym.reset(new XCOFFYAML::CsectAuxEnt()); + auxSymMapping(IO, *cast<XCOFFYAML::CsectAuxEnt>(AuxSym.get()), Is64); + break; + case XCOFFYAML::AUX_SECT: + AuxSym.reset(new XCOFFYAML::SectAuxEntForDWARF()); + auxSymMapping(IO, *cast<XCOFFYAML::SectAuxEntForDWARF>(AuxSym.get())); + break; + case XCOFFYAML::AUX_STAT: + if (Is64) + IO.setError( + "an auxiliary symbol of type AUX_STAT cannot be defined in XCOFF64"); + AuxSym.reset(new XCOFFYAML::SectAuxEntForStat()); + auxSymMapping(IO, *cast<XCOFFYAML::SectAuxEntForStat>(AuxSym.get())); + break; + } +} + void MappingTraits<XCOFFYAML::Symbol>::mapping(IO &IO, XCOFFYAML::Symbol &S) { IO.mapOptional("Name", S.SymbolName); IO.mapOptional("Value", S.Value); @@ -181,6 +334,8 @@ void MappingTraits<XCOFFYAML::Symbol>::mapping(IO &IO, XCOFFYAML::Symbol &S) { IO.mapOptional("Type", S.Type); IO.mapOptional("StorageClass", S.StorageClass); IO.mapOptional("NumberOfAuxEntries", S.NumberOfAuxEntries); + if (!IO.outputting()) + IO.mapOptional("AuxEntries", S.AuxEntries); } void MappingTraits<XCOFFYAML::StringTable>::mapping(IO &IO, XCOFFYAML::StringTable &Str) { @@ -191,12 +346,14 @@ void MappingTraits<XCOFFYAML::StringTable>::mapping(IO &IO, XCOFFYAML::StringTab } void MappingTraits<XCOFFYAML::Object>::mapping(IO &IO, XCOFFYAML::Object &Obj) { + IO.setContext(&Obj); IO.mapTag("!XCOFF", true); 
IO.mapRequired("FileHeader", Obj.Header); IO.mapOptional("AuxiliaryHeader", Obj.AuxHeader); IO.mapOptional("Sections", Obj.Sections); IO.mapOptional("Symbols", Obj.Symbols); IO.mapOptional("StringTable", Obj.StrTbl); + IO.setContext(nullptr); } } // namespace yaml diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index d7615ef4e9bf..015ca1eec4df 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -35,6 +35,7 @@ #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/DomPrinter.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -218,6 +219,7 @@ #include "llvm/Transforms/Utils/BreakCriticalEdges.h" #include "llvm/Transforms/Utils/CanonicalizeAliases.h" #include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h" +#include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/FixIrreducible.h" #include "llvm/Transforms/Utils/HelloWorld.h" @@ -655,6 +657,8 @@ Expected<MemorySanitizerOptions> parseMSanPassOptions(StringRef Params) { ParamName) .str(), inconvertibleErrorCode()); + } else if (ParamName == "eager-checks") { + Result.EagerChecks = true; } else { return make_error<StringError>( formatv("invalid MemorySanitizer pass parameter '{0}' ", ParamName) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index a6a36ff25402..6110bda02406 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -300,6 +300,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // TODO: Investigate promotion cap for O1. LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM1.addPass(SimpleLoopUnswitchPass()); + if (EnableLoopFlatten) + LPM1.addPass(LoopFlattenPass()); LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(IndVarSimplifyPass()); @@ -335,8 +337,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); - if (EnableLoopFlatten) - FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass())); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), @@ -475,6 +475,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass( SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && EnableO3NonTrivialUnswitching)); + if (EnableLoopFlatten) + LPM1.addPass(LoopFlattenPass()); + LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(IndVarSimplifyPass()); @@ -509,8 +512,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); - if (EnableLoopFlatten) - FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass())); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. 
@@ -1623,14 +1624,13 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MainFPM.addPass(DSEPass()); MainFPM.addPass(MergedLoadStoreMotionPass()); - // More loops are countable; try to optimize them. - if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) - MainFPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass())); if (EnableConstraintElimination) MainFPM.addPass(ConstraintEliminationPass()); LoopPassManager LPM; + if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) + LPM.addPass(LoopFlattenPass()); LPM.addPass(IndVarSimplifyPass()); LPM.addPass(LoopDeletionPass()); // FIXME: Add loop interchange. diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 74613a7fcce0..8e0af11b854d 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -48,9 +48,11 @@ MODULE_PASS("openmp-opt", OpenMPOptPass()) MODULE_PASS("called-value-propagation", CalledValuePropagationPass()) MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass()) MODULE_PASS("cg-profile", CGProfilePass()) +MODULE_PASS("check-debugify", NewPMCheckDebugifyPass()) MODULE_PASS("constmerge", ConstantMergePass()) MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass()) MODULE_PASS("deadargelim", DeadArgumentEliminationPass()) +MODULE_PASS("debugify", NewPMDebugifyPass()) MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass()) MODULE_PASS("extract-blocks", BlockExtractorPass()) MODULE_PASS("forceattrs", ForceFunctionAttrsPass()) @@ -62,6 +64,7 @@ MODULE_PASS("globalsplit", GlobalSplitPass()) MODULE_PASS("hotcoldsplit", HotColdSplittingPass()) MODULE_PASS("inferattrs", InferFunctionAttrsPass()) MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass()) +MODULE_PASS("print<inline-advisor>", InlineAdvisorAnalysisPrinterPass(dbgs())) MODULE_PASS("inliner-wrapper-no-mandatory-first", ModuleInlinerWrapperPass( getInlineParams(), false)) @@ -254,6 +257,8 @@ FUNCTION_PASS("div-rem-pairs", DivRemPairsPass()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dot-cfg", CFGPrinterPass()) FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass()) +FUNCTION_PASS("dot-dom", DomTreePrinterPass()) +FUNCTION_PASS("dot-dom-only", DomTreeOnlyPrinterPass()) FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) FUNCTION_PASS("flattencfg", FlattenCFGPass()) FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass()) @@ -410,7 +415,7 @@ FUNCTION_PASS_WITH_PARAMS("msan", return MemorySanitizerPass(Opts); }, parseMSanPassOptions, - "recover;kernel;track-origins=N") + "recover;kernel;eager-checks;track-origins=N") FUNCTION_PASS_WITH_PARAMS("simplifycfg", "SimplifyCFGPass", [](SimplifyCFGOptions Opts) { diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 23c825c78713..c42b1cb26f13 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -441,7 +441,7 @@ const Module *getModuleForComparison(Any IR) { } // namespace -template <typename T> ChangeReporter<T>::~ChangeReporter<T>() { +template <typename T> ChangeReporter<T>::~ChangeReporter() { assert(BeforeStack.empty() && "Problem with Change Printer stack."); } diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp index afef71f5b5ad..72d1addab01e 100644 --- a/llvm/lib/ProfileData/GCOV.cpp +++ b/llvm/lib/ProfileData/GCOV.cpp @@ -346,7 +346,7 @@ StringRef GCOVFunction::getName(bool demangle) const { } } demangled = Name; - } while (0); + } while (false); } return demangled; } diff --git 
a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 34e0c5ebcd58..051655e1fed6 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -119,9 +119,6 @@ static std::string getInstrProfErrString(instrprof_error Err, case instrprof_error::unable_to_correlate_profile: OS << "unable to correlate profile"; break; - case instrprof_error::unsupported_debug_format: - OS << "unsupported debug info format (only DWARF is supported)"; - break; case instrprof_error::invalid_prof: OS << "invalid profile created. Please file a bug " "at: " BUG_REPORT_URL diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp index f9c113027da2..8e38a6869d07 100644 --- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -23,7 +23,8 @@ Expected<object::SectionRef> getCountersSection(const object::ObjectFile &Obj) { if (SectionName.get() == INSTR_PROF_CNTS_SECT_NAME) return Section; return make_error<InstrProfError>( - instrprof_error::unable_to_correlate_profile); + instrprof_error::unable_to_correlate_profile, + "could not find counter section (" INSTR_PROF_CNTS_SECT_NAME ")"); } const char *InstrProfCorrelator::FunctionNameAttributeName = "Function Name"; @@ -54,9 +55,9 @@ InstrProfCorrelator::get(StringRef DebugInfoFilename) { // TODO: Enable profile correlation when there are multiple objects in a // dSYM bundle. if (DsymObjectsOrErr->size() > 1) - return createStringError( - std::error_code(), - "Profile correlation using multiple objects is not yet supported"); + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile, + "using multiple objects is not yet supported"); DebugInfoFilename = *DsymObjectsOrErr->begin(); } auto BufferOrErr = @@ -84,7 +85,16 @@ InstrProfCorrelator::get(std::unique_ptr<MemoryBuffer> Buffer) { return InstrProfCorrelatorImpl<uint32_t>::get(std::move(*CtxOrErr), *Obj); } return make_error<InstrProfError>( - instrprof_error::unable_to_correlate_profile); + instrprof_error::unable_to_correlate_profile, "not an object file"); +} + +Optional<size_t> InstrProfCorrelator::getDataSize() const { + if (auto *C = dyn_cast<InstrProfCorrelatorImpl<uint32_t>>(this)) { + return C->getDataSize(); + } else if (auto *C = dyn_cast<InstrProfCorrelatorImpl<uint64_t>>(this)) { + return C->getDataSize(); + } + return {}; } namespace llvm { @@ -120,16 +130,23 @@ InstrProfCorrelatorImpl<IntPtrT>::get( return std::make_unique<DwarfInstrProfCorrelator<IntPtrT>>(std::move(DICtx), std::move(Ctx)); } - return make_error<InstrProfError>(instrprof_error::unsupported_debug_format); + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile, + "unsupported debug info format (only DWARF is supported)"); } template <class IntPtrT> Error InstrProfCorrelatorImpl<IntPtrT>::correlateProfileData() { - assert(Data.empty() && CompressedNames.empty() && Names.empty()); + assert(Data.empty() && Names.empty() && NamesVec.empty()); correlateProfileDataImpl(); + if (Data.empty() || NamesVec.empty()) + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile, + "could not find any profile metadata in debug info"); auto Result = - collectPGOFuncNameStrings(Names, /*doCompression=*/true, CompressedNames); - Names.clear(); + collectPGOFuncNameStrings(NamesVec, /*doCompression=*/false, Names); + CounterOffsets.clear(); + NamesVec.clear(); return Result; } @@ -139,6 +156,9 @@ void 
InstrProfCorrelatorImpl<IntPtrT>::addProbe(StringRef FunctionName, IntPtrT CounterOffset, IntPtrT FunctionPtr, uint32_t NumCounters) { + // Check if a probe was already added for this counter offset. + if (!CounterOffsets.insert(CounterOffset).second) + return; Data.push_back({ maybeSwap<uint64_t>(IndexedInstrProf::ComputeHash(FunctionName)), maybeSwap<uint64_t>(CFGHash), @@ -151,7 +171,7 @@ void InstrProfCorrelatorImpl<IntPtrT>::addProbe(StringRef FunctionName, maybeSwap<uint32_t>(NumCounters), /*NumValueSites=*/{maybeSwap<uint16_t>(0), maybeSwap<uint16_t>(0)}, }); - Names.push_back(FunctionName.str()); + NamesVec.push_back(FunctionName.str()); } template <class IntPtrT> @@ -163,13 +183,19 @@ DwarfInstrProfCorrelator<IntPtrT>::getLocation(const DWARFDie &Die) const { return {}; } auto &DU = *Die.getDwarfUnit(); + auto AddressSize = DU.getAddressByteSize(); for (auto &Location : *Locations) { - auto AddressSize = DU.getAddressByteSize(); DataExtractor Data(Location.Expr, DICtx->isLittleEndian(), AddressSize); DWARFExpression Expr(Data, AddressSize); - for (auto &Op : Expr) - if (Op.getCode() == dwarf::DW_OP_addr) + for (auto &Op : Expr) { + if (Op.getCode() == dwarf::DW_OP_addr) { return Op.getRawOperand(0); + } else if (Op.getCode() == dwarf::DW_OP_addrx) { + uint64_t Index = Op.getRawOperand(0); + if (auto SA = DU.getAddrOffsetSectionItem(Index)) + return SA->Address; + } + } } return {}; } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 37cdf4dd1fe2..861ff61df510 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -383,22 +383,21 @@ Error RawInstrProfReader<IntPtrT>::readHeader( CountersDelta = swap(Header.CountersDelta); NamesDelta = swap(Header.NamesDelta); - auto DataSize = swap(Header.DataSize); + auto NumData = swap(Header.DataSize); auto PaddingBytesBeforeCounters = swap(Header.PaddingBytesBeforeCounters); - auto CountersSize = swap(Header.CountersSize); + auto CountersSize = swap(Header.CountersSize) * getCounterTypeSize(); auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters); auto NamesSize = swap(Header.NamesSize); ValueKindLast = swap(Header.ValueKindLast); - auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>); + auto DataSize = NumData * sizeof(RawInstrProf::ProfileData<IntPtrT>); auto PaddingSize = getNumPaddingBytes(NamesSize); // Profile data starts after profile header and binary ids if exist. 
ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdsSize; - ptrdiff_t CountersOffset = - DataOffset + DataSizeInBytes + PaddingBytesBeforeCounters; - ptrdiff_t NamesOffset = CountersOffset + (sizeof(uint64_t) * CountersSize) + - PaddingBytesAfterCounters; + ptrdiff_t CountersOffset = DataOffset + DataSize + PaddingBytesBeforeCounters; + ptrdiff_t NamesOffset = + CountersOffset + CountersSize + PaddingBytesAfterCounters; ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; auto *Start = reinterpret_cast<const char *>(&Header); @@ -412,12 +411,12 @@ Error RawInstrProfReader<IntPtrT>::readHeader( assert(CountersDelta == 0 && NamesDelta == 0); Data = Correlator->getDataPointer(); DataEnd = Data + Correlator->getDataSize(); - NamesStart = Correlator->getCompressedNamesPointer(); - NamesEnd = NamesStart + Correlator->getCompressedNamesSize(); + NamesStart = Correlator->getNamesPointer(); + NamesEnd = NamesStart + Correlator->getNamesSize(); } else { Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( Start + DataOffset); - DataEnd = Data + DataSize; + DataEnd = Data + NumData; NamesStart = Start + NamesOffset; NamesEnd = NamesStart + NamesSize; } @@ -425,7 +424,8 @@ Error RawInstrProfReader<IntPtrT>::readHeader( // Binary ids start just after the header. BinaryIdsStart = reinterpret_cast<const uint8_t *>(&Header) + sizeof(RawInstrProf::Header); - CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset); + CountersStart = Start + CountersOffset; + CountersEnd = CountersStart + CountersSize; ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset); const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd(); @@ -459,58 +459,36 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts( if (NumCounters == 0) return error(instrprof_error::malformed, "number of counters is zero"); - ArrayRef<uint64_t> RawCounts; - if (Correlator) { - uint64_t CounterOffset = swap<IntPtrT>(Data->CounterPtr) / sizeof(uint64_t); - RawCounts = - makeArrayRef<uint64_t>(CountersStart + CounterOffset, NumCounters); - } else { - IntPtrT CounterPtr = Data->CounterPtr; - ptrdiff_t CounterOffset = getCounterOffset(CounterPtr); - if (CounterOffset < 0) - return error( - instrprof_error::malformed, - ("counter offset " + Twine(CounterOffset) + " is negative").str()); - - // Check bounds. Note that the counter pointer embedded in the data record - // may itself be corrupt. - auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); - ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart; - if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters) - return error(instrprof_error::malformed, - "counter pointer is out of bounds"); - // We need to compute the in-buffer counter offset from the in-memory - // address distance. The initial CountersDelta is the in-memory address - // difference start(__llvm_prf_cnts)-start(__llvm_prf_data), so - // SrcData->CounterPtr - CountersDelta computes the offset into the - // in-buffer counter section. 
- if (CounterOffset > MaxNumCounters) - return error(instrprof_error::malformed, - ("counter offset " + Twine(CounterOffset) + - " is greater than the maximum number of counters " + - Twine((uint32_t)MaxNumCounters)) - .str()); - - if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters) - return error(instrprof_error::malformed, - ("number of counters " + - Twine(((uint32_t)CounterOffset + NumCounters)) + - " is greater than the maximum number of counters " + - Twine((uint32_t)MaxNumCounters)) - .str()); - // CountersDelta decreases as we advance to the next data record. - CountersDelta -= sizeof(*Data); - - RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters); - } + ptrdiff_t CounterBaseOffset = swap(Data->CounterPtr) - CountersDelta; + if (CounterBaseOffset < 0) + return error( + instrprof_error::malformed, + ("counter offset " + Twine(CounterBaseOffset) + " is negative").str()); - if (ShouldSwapBytes) { - Record.Counts.clear(); - Record.Counts.reserve(RawCounts.size()); - for (uint64_t Count : RawCounts) - Record.Counts.push_back(swap(Count)); - } else - Record.Counts = RawCounts; + if (CounterBaseOffset >= CountersEnd - CountersStart) + return error(instrprof_error::malformed, + ("counter offset " + Twine(CounterBaseOffset) + + " is greater than the maximum counter offset " + + Twine(CountersEnd - CountersStart - 1)) + .str()); + + uint64_t MaxNumCounters = + (CountersEnd - (CountersStart + CounterBaseOffset)) / + getCounterTypeSize(); + if (NumCounters > MaxNumCounters) + return error(instrprof_error::malformed, + ("number of counters " + Twine(NumCounters) + + " is greater than the maximum number of counters " + + Twine(MaxNumCounters)) + .str()); + + Record.Counts.clear(); + Record.Counts.reserve(NumCounters); + for (uint32_t I = 0; I < NumCounters; I++) { + const auto *CounterValue = reinterpret_cast<const uint64_t *>( + CountersStart + CounterBaseOffset + I * getCounterTypeSize()); + Record.Counts.push_back(swap(*CounterValue)); + } return success(); } diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index da16309fb82c..80c02faaba04 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -655,6 +655,8 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( Summary->setPartialProfile(true); if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext)) FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat = true; + if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsCSNested)) + FunctionSamples::ProfileIsCSNested = ProfileIsCSNested; if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator)) FunctionSamples::ProfileIsFS = ProfileIsFS = true; break; @@ -688,9 +690,6 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( ProfileIsProbeBased = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsProbeBased); FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased; - ProfileIsCSNested = - hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested); - FunctionSamples::ProfileIsCSNested = ProfileIsCSNested; bool HasAttribute = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute); if (std::error_code EC = readFuncMetadata(HasAttribute)) @@ -1276,6 +1275,8 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) { Flags.append("partial,"); if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext)) Flags.append("context,"); + if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsCSNested)) + 
Flags.append("context-nested,"); if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator)) Flags.append("fs-discriminator,"); break; @@ -1288,8 +1289,6 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) { Flags.append("probe,"); if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute)) Flags.append("attr,"); - if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested)) - Flags.append("preinlined,"); break; default: break; diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 6f02bd203a9f..b575425d4e94 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -323,13 +323,13 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection( setToCompressSection(SecProfileSymbolList); if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased); - if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCSNested) - addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsCSNested); if (Type == SecFuncMetadata && (FunctionSamples::ProfileIsCSFlat || FunctionSamples::ProfileIsCSNested)) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagHasAttribute); if (Type == SecProfSummary && FunctionSamples::ProfileIsCSFlat) addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); + if (Type == SecProfSummary && FunctionSamples::ProfileIsCSNested) + addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagIsCSNested); if (Type == SecProfSummary && FunctionSamples::ProfileIsFS) addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator); diff --git a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp index 36ba93564771..0810bf531db8 100644 --- a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp +++ b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp @@ -18,7 +18,7 @@ using namespace llvm::remarks; BitstreamRemarkSerializerHelper::BitstreamRemarkSerializerHelper( BitstreamRemarkContainerType ContainerType) - : Encoded(), R(), Bitstream(Encoded), ContainerType(ContainerType) {} + : Bitstream(Encoded), ContainerType(ContainerType) {} static void push(SmallVectorImpl<uint64_t> &R, StringRef Str) { append_range(R, Str); diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp index 057d1a378599..e6b7de1a2cf5 100644 --- a/llvm/lib/Remarks/Remark.cpp +++ b/llvm/lib/Remarks/Remark.cpp @@ -111,7 +111,7 @@ LLVMRemarkEntryGetFirstArg(LLVMRemarkEntryRef Remark) { ArrayRef<Argument> Args = unwrap(Remark)->Args; // No arguments to iterate on. if (Args.empty()) - return NULL; + return nullptr; return reinterpret_cast<LLVMRemarkArgRef>( const_cast<Argument *>(Args.begin())); } @@ -119,13 +119,13 @@ LLVMRemarkEntryGetFirstArg(LLVMRemarkEntryRef Remark) { extern "C" LLVMRemarkArgRef LLVMRemarkEntryGetNextArg(LLVMRemarkArgRef ArgIt, LLVMRemarkEntryRef Remark) { // No more arguments to iterate on. 
- if (ArgIt == NULL) - return NULL; + if (ArgIt == nullptr) + return nullptr; auto It = (ArrayRef<Argument>::const_iterator)ArgIt; auto Next = std::next(It); if (Next == unwrap(Remark)->Args.end()) - return NULL; + return nullptr; return reinterpret_cast<LLVMRemarkArgRef>(const_cast<Argument *>(Next)); } diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp index 2f00b8e73670..543b00723659 100644 --- a/llvm/lib/Remarks/RemarkStreamer.cpp +++ b/llvm/lib/Remarks/RemarkStreamer.cpp @@ -26,7 +26,7 @@ static cl::opt<cl::boolOrDefault> EnableRemarksSection( RemarkStreamer::RemarkStreamer( std::unique_ptr<remarks::RemarkSerializer> RemarkSerializer, Optional<StringRef> FilenameIn) - : PassFilter(), RemarkSerializer(std::move(RemarkSerializer)), + : RemarkSerializer(std::move(RemarkSerializer)), Filename(FilenameIn ? Optional<std::string>(FilenameIn->str()) : None) {} Error RemarkStreamer::setFilter(StringRef Filter) { diff --git a/llvm/lib/Remarks/RemarkStringTable.cpp b/llvm/lib/Remarks/RemarkStringTable.cpp index 5f462f01bb9a..03d93baba038 100644 --- a/llvm/lib/Remarks/RemarkStringTable.cpp +++ b/llvm/lib/Remarks/RemarkStringTable.cpp @@ -20,7 +20,7 @@ using namespace llvm; using namespace llvm::remarks; -StringTable::StringTable(const ParsedStringTable &Other) : StrTab() { +StringTable::StringTable(const ParsedStringTable &Other) { for (unsigned i = 0, e = Other.size(); i < e; ++i) if (Expected<StringRef> MaybeStr = Other[i]) add(*MaybeStr); diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp index 3d9996c931ae..a32629c9f557 100644 --- a/llvm/lib/Remarks/YAMLRemarkParser.cpp +++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp @@ -171,7 +171,7 @@ YAMLRemarkParser::YAMLRemarkParser(StringRef Buf) YAMLRemarkParser::YAMLRemarkParser(StringRef Buf, Optional<ParsedStringTable> StrTab) - : RemarkParser{Format::YAML}, StrTab(std::move(StrTab)), LastErrorMessage(), + : RemarkParser{Format::YAML}, StrTab(std::move(StrTab)), SM(setupSM(LastErrorMessage)), Stream(Buf, SM), YAMLIt(Stream.begin()) {} Error YAMLRemarkParser::error(StringRef Message, yaml::Node &Node) { diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp index 4bc9c8487131..cdf7c8ade9aa 100644 --- a/llvm/lib/Support/AArch64TargetParser.cpp +++ b/llvm/lib/Support/AArch64TargetParser.cpp @@ -114,6 +114,12 @@ bool AArch64::getExtensionFeatures(uint64_t Extensions, Features.push_back("+sme-f64"); if (Extensions & AArch64::AEK_SMEI64) Features.push_back("+sme-i64"); + if (Extensions & AArch64::AEK_HBC) + Features.push_back("+hbc"); + if (Extensions & AArch64::AEK_MOPS) + Features.push_back("+mops"); + if (Extensions & AArch64::AEK_PERFMON) + Features.push_back("+perfmon"); return true; } @@ -136,12 +142,16 @@ bool AArch64::getArchFeatures(AArch64::ArchKind AK, Features.push_back("+v8.6a"); if (AK == AArch64::ArchKind::ARMV8_7A) Features.push_back("+v8.7a"); + if (AK == AArch64::ArchKind::ARMV8_8A) + Features.push_back("+v8.8a"); if (AK == AArch64::ArchKind::ARMV9A) Features.push_back("+v9a"); if (AK == AArch64::ArchKind::ARMV9_1A) Features.push_back("+v9.1a"); if (AK == AArch64::ArchKind::ARMV9_2A) Features.push_back("+v9.2a"); + if (AK == AArch64::ArchKind::ARMV9_3A) + Features.push_back("+v9.3a"); if(AK == AArch64::ArchKind::ARMV8R) Features.push_back("+v8r"); diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 4940b61602d1..b536e9a9a6d0 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ 
-24,9 +24,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <climits> #include <cmath> -#include <cstdlib> #include <cstring> using namespace llvm; diff --git a/llvm/lib/Support/ARMAttributeParser.cpp b/llvm/lib/Support/ARMAttributeParser.cpp index 241cfb1eedbe..908e56319025 100644 --- a/llvm/lib/Support/ARMAttributeParser.cpp +++ b/llvm/lib/Support/ARMAttributeParser.cpp @@ -9,8 +9,6 @@ #include "llvm/Support/ARMAttributeParser.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/LEB128.h" #include "llvm/Support/ScopedPrinter.h" using namespace llvm; @@ -70,7 +68,7 @@ const ARMAttributeParser::DisplayHandler ARMAttributeParser::displayRoutines[] = Error ARMAttributeParser::stringAttribute(AttrType tag) { StringRef tagName = - ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*TagPrefix=*/false); + ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*hasTagPrefix=*/false); StringRef desc = de.getCStrRef(cursor); if (sw) { diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp index 4405ed176fe2..d7294b5b1074 100644 --- a/llvm/lib/Support/ARMTargetParser.cpp +++ b/llvm/lib/Support/ARMTargetParser.cpp @@ -77,6 +77,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) { case ArchKind::ARMV8_5A: case ArchKind::ARMV8_6A: case ArchKind::ARMV8_7A: + case ArchKind::ARMV8_8A: case ArchKind::ARMV8R: case ArchKind::ARMV8MBaseline: case ArchKind::ARMV8MMainline: @@ -85,6 +86,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) { case ArchKind::ARMV9A: case ArchKind::ARMV9_1A: case ArchKind::ARMV9_2A: + case ArchKind::ARMV9_3A: return 9; case ArchKind::INVALID: return 0; @@ -117,9 +119,11 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) { case ArchKind::ARMV8_5A: case ArchKind::ARMV8_6A: case ArchKind::ARMV8_7A: + case ArchKind::ARMV8_8A: case ArchKind::ARMV9A: case ArchKind::ARMV9_1A: case ArchKind::ARMV9_2A: + case ArchKind::ARMV9_3A: return ProfileKind::A; case ArchKind::ARMV2: case ArchKind::ARMV2A: @@ -164,10 +168,12 @@ StringRef ARM::getArchSynonym(StringRef Arch) { .Case("v8.5a", "v8.5-a") .Case("v8.6a", "v8.6-a") .Case("v8.7a", "v8.7-a") + .Case("v8.8a", "v8.8-a") .Case("v8r", "v8-r") .Cases("v9", "v9a", "v9-a") .Case("v9.1a", "v9.1-a") .Case("v9.2a", "v9.2-a") + .Case("v9.3a", "v9.3-a") .Case("v8m.base", "v8-m.base") .Case("v8m.main", "v8-m.main") .Case("v8.1m.main", "v8.1-m.main") diff --git a/llvm/lib/Support/ARMWinEH.cpp b/llvm/lib/Support/ARMWinEH.cpp index 2e2fcf28451f..8e7fa1149082 100644 --- a/llvm/lib/Support/ARMWinEH.cpp +++ b/llvm/lib/Support/ARMWinEH.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ARMWinEH.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { namespace ARM { diff --git a/llvm/lib/Support/BinaryStreamError.cpp b/llvm/lib/Support/BinaryStreamError.cpp index f22523f09ac8..9b8f6862b65c 100644 --- a/llvm/lib/Support/BinaryStreamError.cpp +++ b/llvm/lib/Support/BinaryStreamError.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/BinaryStreamError.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Support/BlockFrequency.cpp b/llvm/lib/Support/BlockFrequency.cpp index 2b63294f3789..702165ac480b 100644 --- a/llvm/lib/Support/BlockFrequency.cpp +++ 
b/llvm/lib/Support/BlockFrequency.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" #include <cassert> using namespace llvm; diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp index 8c685640f791..d6902f660e39 100644 --- a/llvm/lib/Support/Caching.cpp +++ b/llvm/lib/Support/Caching.cpp @@ -30,8 +30,6 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, Twine TempFilePrefixRef, Twine CacheDirectoryPathRef, AddBufferFn AddBuffer) { - if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPathRef)) - return errorCodeToError(EC); // Create local copies which are safely captured-by-copy in lambdas SmallString<64> CacheName, TempFilePrefix, CacheDirectoryPath; @@ -140,6 +138,12 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, }; return [=](size_t Task) -> Expected<std::unique_ptr<CachedFileStream>> { + // Create the cache directory if not already done. Doing this lazily + // ensures the filesystem isn't mutated until the cache is. + if (std::error_code EC = sys::fs::create_directories( + CacheDirectoryPath, /*IgnoreExisting=*/true)) + return errorCodeToError(EC); + // Write to a temporary to avoid race condition SmallString<64> TempFilenameModel; sys::path::append(TempFilenameModel, CacheDirectoryPath, diff --git a/llvm/lib/Support/CodeGenCoverage.cpp b/llvm/lib/Support/CodeGenCoverage.cpp index 93f386b6e23d..73e0fb3edce8 100644 --- a/llvm/lib/Support/CodeGenCoverage.cpp +++ b/llvm/lib/Support/CodeGenCoverage.cpp @@ -27,7 +27,7 @@ CodeGenCoverage::CodeGenCoverage() {} void CodeGenCoverage::setCovered(uint64_t RuleID) { if (RuleCoverage.size() <= RuleID) - RuleCoverage.resize(RuleID + 1, 0); + RuleCoverage.resize(RuleID + 1, false); RuleCoverage[RuleID] = true; } diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 4153a69abf5d..71a6ebf2a72e 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -22,7 +22,7 @@ #include "llvm-c/Support.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" @@ -45,7 +45,6 @@ #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include <cstdlib> -#include <map> #include <string> using namespace llvm; using namespace cl; @@ -1078,11 +1077,45 @@ static bool hasUTF8ByteOrderMark(ArrayRef<char> S) { return (S.size() >= 3 && S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf'); } +// Substitute <CFGDIR> with the file's base path. +static void ExpandBasePaths(StringRef BasePath, StringSaver &Saver, + const char *&Arg) { + assert(sys::path::is_absolute(BasePath)); + constexpr StringLiteral Token("<CFGDIR>"); + const StringRef ArgString(Arg); + + SmallString<128> ResponseFile; + StringRef::size_type StartPos = 0; + for (StringRef::size_type TokenPos = ArgString.find(Token); + TokenPos != StringRef::npos; + TokenPos = ArgString.find(Token, StartPos)) { + // Token may appear more than once per arg (e.g. comma-separated linker + // args). Support by using path-append on any subsequent appearances. 
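  // (Editor's illustration, hypothetical argument: with BasePath "/cfg", the
  //  argument "-Wl,-L<CFGDIR>/a,-L<CFGDIR>/b" expands to
  //  "-Wl,-L/cfg/a,-L/cfg/b".)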
+ const StringRef LHS = ArgString.substr(StartPos, TokenPos - StartPos); + if (ResponseFile.empty()) + ResponseFile = LHS; + else + llvm::sys::path::append(ResponseFile, LHS); + ResponseFile.append(BasePath); + StartPos = TokenPos + Token.size(); + } + + if (!ResponseFile.empty()) { + // Path-append the remaining arg substring if at least one token appeared. + const StringRef Remaining = ArgString.substr(StartPos); + if (!Remaining.empty()) + llvm::sys::path::append(ResponseFile, Remaining); + Arg = Saver.save(ResponseFile.str()).data(); + } +} + // FName must be an absolute path. -static llvm::Error ExpandResponseFile( - StringRef FName, StringSaver &Saver, TokenizerCallback Tokenizer, - SmallVectorImpl<const char *> &NewArgv, bool MarkEOLs, bool RelativeNames, - llvm::vfs::FileSystem &FS) { +static llvm::Error ExpandResponseFile(StringRef FName, StringSaver &Saver, + TokenizerCallback Tokenizer, + SmallVectorImpl<const char *> &NewArgv, + bool MarkEOLs, bool RelativeNames, + bool ExpandBasePath, + llvm::vfs::FileSystem &FS) { assert(sys::path::is_absolute(FName)); llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> MemBufOrErr = FS.getBufferForFile(FName); @@ -1116,8 +1149,15 @@ static llvm::Error ExpandResponseFile( // file, replace the included response file names with their full paths // obtained by required resolution. for (auto &Arg : NewArgv) { + if (!Arg) + continue; + + // Substitute <CFGDIR> with the file's base path. + if (ExpandBasePath) + ExpandBasePaths(BasePath, Saver, Arg); + // Skip non-rsp file arguments. - if (!Arg || Arg[0] != '@') + if (Arg[0] != '@') continue; StringRef FileName(Arg + 1); @@ -1129,7 +1169,7 @@ static llvm::Error ExpandResponseFile( ResponseFile.push_back('@'); ResponseFile.append(BasePath); llvm::sys::path::append(ResponseFile, FileName); - Arg = Saver.save(ResponseFile.c_str()).data(); + Arg = Saver.save(ResponseFile.str()).data(); } return Error::success(); } @@ -1138,7 +1178,7 @@ static llvm::Error ExpandResponseFile( /// StringSaver and tokenization strategy. bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, SmallVectorImpl<const char *> &Argv, bool MarkEOLs, - bool RelativeNames, + bool RelativeNames, bool ExpandBasePath, llvm::Optional<llvm::StringRef> CurrentDir, llvm::vfs::FileSystem &FS) { bool AllExpanded = true; @@ -1218,7 +1258,7 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, SmallVector<const char *, 0> ExpandedArgv; if (llvm::Error Err = ExpandResponseFile(FName, Saver, Tokenizer, ExpandedArgv, MarkEOLs, - RelativeNames, FS)) { + RelativeNames, ExpandBasePath, FS)) { // We couldn't read this file, so we leave it in the argument stream and // move on. // TODO: The error should be propagated up the stack. 
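
(Editor's aside, not part of the import: the ExpandBasePath plumbing added above is what lets a configuration file refer to its own directory. A minimal sketch of how a driver might exercise it; the config file path and its contents are hypothetical:)

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/Allocator.h"
    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/StringSaver.h"

    int main() {
      // Suppose the (hypothetical) file /etc/llvm/tool.cfg contains:
      //   -L<CFGDIR>/lib -Wl,-rpath,<CFGDIR>/lib
      llvm::BumpPtrAllocator Alloc;
      llvm::StringSaver Saver(Alloc);
      llvm::SmallVector<const char *, 16> Argv;
      if (llvm::cl::readConfigFile("/etc/llvm/tool.cfg", Saver, Argv)) {
        // readConfigFile passes ExpandBasePath=true, so Argv now holds
        // "-L/etc/llvm/lib" and "-Wl,-rpath,/etc/llvm/lib".
      }
      return 0;
    }
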
@@ -1250,11 +1290,11 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, SmallVectorImpl<const char *> &Argv, bool MarkEOLs, - bool RelativeNames, + bool RelativeNames, bool ExpandBasePath, llvm::Optional<StringRef> CurrentDir) { return ExpandResponseFiles(Saver, std::move(Tokenizer), Argv, MarkEOLs, - RelativeNames, std::move(CurrentDir), - *vfs::getRealFileSystem()); + RelativeNames, ExpandBasePath, + std::move(CurrentDir), *vfs::getRealFileSystem()); } bool cl::expandResponseFiles(int Argc, const char *const *Argv, @@ -1281,16 +1321,17 @@ bool cl::readConfigFile(StringRef CfgFile, StringSaver &Saver, llvm::sys::path::append(AbsPath, CfgFile); CfgFile = AbsPath.str(); } - if (llvm::Error Err = - ExpandResponseFile(CfgFile, Saver, cl::tokenizeConfigFile, Argv, - /*MarkEOLs=*/false, /*RelativeNames=*/true, - *llvm::vfs::getRealFileSystem())) { + if (llvm::Error Err = ExpandResponseFile( + CfgFile, Saver, cl::tokenizeConfigFile, Argv, + /*MarkEOLs=*/false, /*RelativeNames=*/true, /*ExpandBasePath=*/true, + *llvm::vfs::getRealFileSystem())) { // TODO: The error should be propagated up the stack. llvm::consumeError(std::move(Err)); return false; } return ExpandResponseFiles(Saver, cl::tokenizeConfigFile, Argv, - /*MarkEOLs=*/false, /*RelativeNames=*/true); + /*MarkEOLs=*/false, /*RelativeNames=*/true, + /*ExpandBasePath=*/true, llvm::None); } static void initCommonOptions(); @@ -2297,7 +2338,7 @@ public: protected: void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) override { std::vector<OptionCategory *> SortedCategories; - std::map<OptionCategory *, std::vector<Option *>> CategorizedOptions; + DenseMap<OptionCategory *, std::vector<Option *>> CategorizedOptions; // Collect registered option categories into vector in preparation for // sorting. @@ -2309,17 +2350,13 @@ protected: array_pod_sort(SortedCategories.begin(), SortedCategories.end(), OptionCategoryCompare); - // Create map to empty vectors. - for (OptionCategory *Category : SortedCategories) - CategorizedOptions[Category] = std::vector<Option *>(); - // Walk through pre-sorted options and assign into categories. // Because the options are already alphabetically sorted the // options within categories will also be alphabetically sorted. 
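    // (Editor's note: DenseMap::operator[] default-constructs the mapped
    //  vector on first access, which is why the pre-population loop above
    //  could be dropped.)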
for (size_t I = 0, E = Opts.size(); I != E; ++I) { Option *Opt = Opts[I].second; for (auto &Cat : Opt->Categories) { - assert(CategorizedOptions.count(Cat) > 0 && + assert(find(SortedCategories, Cat) != SortedCategories.end() && "Option has an unregistered category"); CategorizedOptions[Cat].push_back(Opt); } diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp index b6aaf373a522..2ee3074b840e 100644 --- a/llvm/lib/Support/CrashRecoveryContext.cpp +++ b/llvm/lib/Support/CrashRecoveryContext.cpp @@ -9,7 +9,6 @@ #include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ExitCodes.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/ThreadLocal.h" @@ -17,6 +16,10 @@ #include <mutex> #include <setjmp.h> +#if !defined(_MSC_VER) && !defined(_WIN32) +#include "llvm/Support/ExitCodes.h" +#endif + using namespace llvm; namespace { diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index a6daee00bd43..f1b730e2b58c 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -37,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> -#include <iterator> #include <map> using namespace llvm; diff --git a/llvm/lib/Support/DataExtractor.cpp b/llvm/lib/Support/DataExtractor.cpp index 133d674275e8..8cf312191153 100644 --- a/llvm/lib/Support/DataExtractor.cpp +++ b/llvm/lib/Support/DataExtractor.cpp @@ -9,7 +9,6 @@ #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Host.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/SwapByteOrder.h" diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp index 077629670e40..69f39386798c 100644 --- a/llvm/lib/Support/DivisionByConstantInfo.cpp +++ b/llvm/lib/Support/DivisionByConstantInfo.cpp @@ -67,7 +67,7 @@ UnsignedDivisonByConstantInfo::get(const APInt &D, unsigned LeadingZeros) { unsigned P; APInt NC, Delta, Q1, R1, Q2, R2; struct UnsignedDivisonByConstantInfo Retval; - Retval.IsAdd = 0; // initialize "add" indicator + Retval.IsAdd = false; // initialize "add" indicator APInt AllOnes = APInt::getAllOnes(D.getBitWidth()).lshr(LeadingZeros); APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth()); APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth()); @@ -89,12 +89,12 @@ UnsignedDivisonByConstantInfo::get(const APInt &D, unsigned LeadingZeros) { } if ((R2 + 1).uge(D - R2)) { if (Q2.uge(SignedMax)) - Retval.IsAdd = 1; + Retval.IsAdd = true; Q2 = Q2 + Q2 + 1; // update Q2 R2 = R2 + R2 + 1 - D; // update R2 } else { if (Q2.uge(SignedMin)) - Retval.IsAdd = 1; + Retval.IsAdd = true; Q2 = Q2 + Q2; // update Q2 R2 = R2 + R2 + 1; // update R2 } diff --git a/llvm/lib/Support/ELFAttributeParser.cpp b/llvm/lib/Support/ELFAttributeParser.cpp index 1206553343ef..cf8a666e92bc 100644 --- a/llvm/lib/Support/ELFAttributeParser.cpp +++ b/llvm/lib/Support/ELFAttributeParser.cpp @@ -7,10 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ELFAttributeParser.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Errc.h" -#include "llvm/Support/LEB128.h" #include "llvm/Support/ScopedPrinter.h" using namespace llvm; diff --git 
a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp index 4b4406c4c9f4..c11ee59da0dd 100644 --- a/llvm/lib/Support/FileOutputBuffer.cpp +++ b/llvm/lib/Support/FileOutputBuffer.cpp @@ -11,11 +11,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/FileOutputBuffer.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Memory.h" -#include "llvm/Support/Path.h" #include <system_error> #if !defined(_MSC_VER) && !defined(__MINGW32__) diff --git a/llvm/lib/Support/FileUtilities.cpp b/llvm/lib/Support/FileUtilities.cpp index dbe28e56b2c3..489b8d119e6f 100644 --- a/llvm/lib/Support/FileUtilities.cpp +++ b/llvm/lib/Support/FileUtilities.cpp @@ -12,16 +12,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/FileUtilities.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include <cctype> -#include <cmath> #include <cstdint> #include <cstdlib> #include <cstring> diff --git a/llvm/lib/Support/GraphWriter.cpp b/llvm/lib/Support/GraphWriter.cpp index 696e6b7a99d8..e875e18a7e92 100644 --- a/llvm/lib/Support/GraphWriter.cpp +++ b/llvm/lib/Support/GraphWriter.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" @@ -26,7 +25,11 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/raw_ostream.h" -#include <cassert> + +#ifdef __APPLE__ +#include "llvm/Support/CommandLine.h" +#endif + #include <string> #include <system_error> #include <vector> diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 7b14616f6fea..9a4470289bcf 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -83,12 +83,12 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) { StringRef::const_iterator CIP = CPUInfoStart; - StringRef::const_iterator CPUStart = 0; + StringRef::const_iterator CPUStart = nullptr; size_t CPULen = 0; // We need to find the first line which starts with cpu, spaces, and a colon. // After the colon, there may be some additional spaces and then the cpu type. 
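  // (Editor's note: a typical PowerPC /proc/cpuinfo line reads
  //    cpu             : POWER9 (architected), altivec supported
  //  though spacing and the trailing detail vary by machine.)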
- while (CIP < CPUInfoEnd && CPUStart == 0) { + while (CIP < CPUInfoEnd && CPUStart == nullptr) { if (CIP < CPUInfoEnd && *CIP == '\n') ++CIP; @@ -118,12 +118,12 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) { } } - if (CPUStart == 0) + if (CPUStart == nullptr) while (CIP < CPUInfoEnd && *CIP != '\n') ++CIP; } - if (CPUStart == 0) + if (CPUStart == nullptr) return generic; return StringSwitch<const char *>(StringRef(CPUStart, CPULen)) @@ -213,6 +213,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { .Case("0xd44", "cortex-x1") .Case("0xd0c", "neoverse-n1") .Case("0xd49", "neoverse-n2") + .Case("0xd40", "neoverse-v1") .Default("generic"); } diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index 152de6ebae0a..2b7173b28940 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -7,14 +7,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/InitLLVM.h" -#include "llvm/Support/Error.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/PrettyStackTrace.h" -#include "llvm/Support/Process.h" #include "llvm/Support/Signals.h" -#include <string> +#include "llvm/Support/SwapByteOrder.h" #ifdef _WIN32 +#include "llvm/Support/Error.h" #include "llvm/Support/Windows/WindowsSupport.h" #endif diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp index 17b36ed51850..20babbe56d86 100644 --- a/llvm/lib/Support/JSON.cpp +++ b/llvm/lib/Support/JSON.cpp @@ -12,6 +12,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/NativeFormatting.h" #include <cctype> namespace llvm { diff --git a/llvm/lib/Support/LowLevelType.cpp b/llvm/lib/Support/LowLevelType.cpp index ecf557997ad1..0282cd9bd79e 100644 --- a/llvm/lib/Support/LowLevelType.cpp +++ b/llvm/lib/Support/LowLevelType.cpp @@ -17,7 +17,7 @@ using namespace llvm; LLT::LLT(MVT VT) { if (VT.isVector()) { - bool asVector = VT.getVectorNumElements() > 1; + bool asVector = VT.getVectorMinNumElements() > 1; init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector, VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(), /*AddressSpace=*/0); diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp index 9dceb4d418cd..caadde389504 100644 --- a/llvm/lib/Support/MD5.cpp +++ b/llvm/lib/Support/MD5.cpp @@ -40,10 +40,9 @@ #include "llvm/Support/MD5.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" #include <array> #include <cstdint> #include <cstring> @@ -281,14 +280,12 @@ StringRef MD5::result() { SmallString<32> MD5::MD5Result::digest() const { SmallString<32> Str; - raw_svector_ostream Res(Str); - for (int i = 0; i < 16; ++i) - Res << format("%.2x", Bytes[i]); + toHex(Bytes, /*LowerCase*/ true, Str); return Str; } -void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) { - Str = Result.digest(); +void MD5::stringifyResult(MD5Result &Result, SmallVectorImpl<char> &Str) { + toHex(Result.Bytes, /*LowerCase*/ true, Str); } std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) { diff --git a/llvm/lib/Support/MSP430AttributeParser.cpp b/llvm/lib/Support/MSP430AttributeParser.cpp index 
a9948a158fc0..a230a3a70adb 100644 --- a/llvm/lib/Support/MSP430AttributeParser.cpp +++ b/llvm/lib/Support/MSP430AttributeParser.cpp @@ -7,7 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/MSP430AttributeParser.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; using namespace llvm::MSP430Attrs; diff --git a/llvm/lib/Support/MemAlloc.cpp b/llvm/lib/Support/MemAlloc.cpp index 7aaa0dc6e205..07a26cf26480 100644 --- a/llvm/lib/Support/MemAlloc.cpp +++ b/llvm/lib/Support/MemAlloc.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/MemAlloc.h" +#include <new> // These are out of line to have __cpp_aligned_new not affect ABI. diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index d3fa3c6f065d..7816779cca1d 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -14,16 +14,15 @@ #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" #include "llvm/Support/AutoConvert.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Errc.h" -#include "llvm/Support/Errno.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/SmallVectorMemoryBuffer.h" #include <cassert> -#include <cerrno> #include <cstring> #include <new> #include <sys/types.h> @@ -220,28 +219,16 @@ public: MemoryBuffer::BufferKind getBufferKind() const override { return MemoryBuffer::MemoryBuffer_MMap; } + + void dontNeedIfMmap() override { MFR.dontNeed(); } }; } // namespace static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> getMemoryBufferForStream(sys::fs::file_t FD, const Twine &BufferName) { - const ssize_t ChunkSize = 4096*4; - SmallString<ChunkSize> Buffer; - - // Read into Buffer until we hit EOF. - size_t Size = Buffer.size(); - for (;;) { - Buffer.resize_for_overwrite(Size + ChunkSize); - Expected<size_t> ReadBytes = sys::fs::readNativeFile( - FD, makeMutableArrayRef(Buffer.begin() + Size, ChunkSize)); - if (!ReadBytes) - return errorToErrorCode(ReadBytes.takeError()); - if (*ReadBytes == 0) - break; - Size += *ReadBytes; - } - Buffer.truncate(Size); - + SmallString<sys::fs::DefaultReadChunkSize> Buffer; + if (Error E = sys::fs::readNativeFileToEOF(FD, Buffer)) + return errorToErrorCode(std::move(E)); return getMemBufferCopyImpl(Buffer, BufferName); } diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp index 254d18d797b3..0a797046bb68 100644 --- a/llvm/lib/Support/NativeFormatting.cpp +++ b/llvm/lib/Support/NativeFormatting.cpp @@ -13,7 +13,6 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <float.h> using namespace llvm; diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp index 71e3a1362f7e..4977c188f934 100644 --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -174,3 +174,35 @@ void TaskGroup::spawn(std::function<void()> F) { } // namespace parallel } // namespace llvm #endif // LLVM_ENABLE_THREADS + +void llvm::parallelForEachN(size_t Begin, size_t End, + llvm::function_ref<void(size_t)> Fn) { + // If we have zero or one items, then do not incur the overhead of spinning up + // a task group. 
They are surprisingly expensive, and because they do not + // support nested parallelism, a single entry task group can block parallel + // execution underneath them. +#if LLVM_ENABLE_THREADS + auto NumItems = End - Begin; + if (NumItems > 1 && parallel::strategy.ThreadsRequested != 1) { + // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling + // overhead on large inputs. + auto TaskSize = NumItems / parallel::detail::MaxTasksPerGroup; + if (TaskSize == 0) + TaskSize = 1; + + parallel::detail::TaskGroup TG; + for (; Begin + TaskSize < End; Begin += TaskSize) { + TG.spawn([=, &Fn] { + for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I) + Fn(I); + }); + } + for (; Begin != End; ++Begin) + Fn(Begin); + return; + } +#endif + + for (; Begin != End; ++Begin) + Fn(Begin); +} diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 7c99d088911c..63d8d4ee4648 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -12,6 +12,7 @@ #include "llvm/Support/Path.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Config/config.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/Endian.h" @@ -1167,6 +1168,25 @@ const char *mapped_file_region::const_data() const { return reinterpret_cast<const char *>(Mapping); } +Error readNativeFileToEOF(file_t FileHandle, SmallVectorImpl<char> &Buffer, + ssize_t ChunkSize) { + // Install a handler to truncate the buffer to the correct size on exit. + size_t Size = Buffer.size(); + auto TruncateOnExit = make_scope_exit([&]() { Buffer.truncate(Size); }); + + // Read into Buffer until we hit EOF. + for (;;) { + Buffer.resize_for_overwrite(Size + ChunkSize); + Expected<size_t> ReadBytes = readNativeFile( + FileHandle, makeMutableArrayRef(Buffer.begin() + Size, ChunkSize)); + if (!ReadBytes) + return ReadBytes.takeError(); + if (*ReadBytes == 0) + return Error::success(); + Size += *ReadBytes; + } +} + } // end namespace fs } // end namespace sys } // end namespace llvm @@ -1234,7 +1254,8 @@ Error TempFile::keep(const Twine &Name) { #ifdef _WIN32 // If we can't cancel the delete don't rename. auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD)); - std::error_code RenameEC = setDeleteDisposition(H, false); + std::error_code RenameEC = + RemoveOnClose ? 
std::error_code() : setDeleteDisposition(H, false); bool ShouldDelete = false; if (!RenameEC) { RenameEC = rename_handle(H, Name); diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp index 0d07057f1df0..fa91405fee10 100644 --- a/llvm/lib/Support/PrettyStackTrace.cpp +++ b/llvm/lib/Support/PrettyStackTrace.cpp @@ -13,7 +13,6 @@ #include "llvm/Support/PrettyStackTrace.h" #include "llvm-c/ErrorHandling.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SaveAndRestore.h" @@ -21,6 +20,10 @@ #include "llvm/Support/Watchdog.h" #include "llvm/Support/raw_ostream.h" +#ifdef __APPLE__ +#include "llvm/ADT/SmallString.h" +#endif + #include <atomic> #include <cassert> #include <cstdarg> diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index e2e4340f44e9..6c59d8a7ef04 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -9,6 +9,7 @@ #include "llvm/Support/RISCVISAInfo.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Errc.h" @@ -46,25 +47,56 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"f", RISCVExtensionVersion{2, 0}}, {"d", RISCVExtensionVersion{2, 0}}, {"c", RISCVExtensionVersion{2, 0}}, -}; -static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { - {"v", RISCVExtensionVersion{0, 10}}, + {"zfhmin", RISCVExtensionVersion{1, 0}}, + {"zfh", RISCVExtensionVersion{1, 0}}, + {"zba", RISCVExtensionVersion{1, 0}}, {"zbb", RISCVExtensionVersion{1, 0}}, {"zbc", RISCVExtensionVersion{1, 0}}, + {"zbs", RISCVExtensionVersion{1, 0}}, + + {"zbkb", RISCVExtensionVersion{1, 0}}, + {"zbkc", RISCVExtensionVersion{1, 0}}, + {"zbkx", RISCVExtensionVersion{1, 0}}, + {"zknd", RISCVExtensionVersion{1, 0}}, + {"zkne", RISCVExtensionVersion{1, 0}}, + {"zknh", RISCVExtensionVersion{1, 0}}, + {"zksed", RISCVExtensionVersion{1, 0}}, + {"zksh", RISCVExtensionVersion{1, 0}}, + {"zkr", RISCVExtensionVersion{1, 0}}, + {"zkn", RISCVExtensionVersion{1, 0}}, + {"zks", RISCVExtensionVersion{1, 0}}, + {"zkt", RISCVExtensionVersion{1, 0}}, + {"zk", RISCVExtensionVersion{1, 0}}, + + {"v", RISCVExtensionVersion{1, 0}}, + {"zvl32b", RISCVExtensionVersion{1, 0}}, + {"zvl64b", RISCVExtensionVersion{1, 0}}, + {"zvl128b", RISCVExtensionVersion{1, 0}}, + {"zvl256b", RISCVExtensionVersion{1, 0}}, + {"zvl512b", RISCVExtensionVersion{1, 0}}, + {"zvl1024b", RISCVExtensionVersion{1, 0}}, + {"zvl2048b", RISCVExtensionVersion{1, 0}}, + {"zvl4096b", RISCVExtensionVersion{1, 0}}, + {"zvl8192b", RISCVExtensionVersion{1, 0}}, + {"zvl16384b", RISCVExtensionVersion{1, 0}}, + {"zvl32768b", RISCVExtensionVersion{1, 0}}, + {"zvl65536b", RISCVExtensionVersion{1, 0}}, + {"zve32x", RISCVExtensionVersion{1, 0}}, + {"zve32f", RISCVExtensionVersion{1, 0}}, + {"zve64x", RISCVExtensionVersion{1, 0}}, + {"zve64f", RISCVExtensionVersion{1, 0}}, + {"zve64d", RISCVExtensionVersion{1, 0}}, +}; + +static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zbe", RISCVExtensionVersion{0, 93}}, {"zbf", RISCVExtensionVersion{0, 93}}, {"zbm", RISCVExtensionVersion{0, 93}}, {"zbp", RISCVExtensionVersion{0, 93}}, {"zbr", RISCVExtensionVersion{0, 93}}, - {"zbs", RISCVExtensionVersion{1, 0}}, {"zbt", RISCVExtensionVersion{0, 93}}, - - {"zvlsseg", RISCVExtensionVersion{0, 10}}, - - {"zfhmin", 
RISCVExtensionVersion{0, 1}}, - {"zfh", RISCVExtensionVersion{0, 1}}, }; static bool stripExperimentalPrefix(StringRef &Ext) { @@ -78,9 +110,9 @@ static bool stripExperimentalPrefix(StringRef &Ext) { // NOTE: This function is NOT able to take empty strings or strings that only // have version numbers and no extension name. It assumes the extension name // will be at least more than one character. -static size_t findFirstNonVersionCharacter(const StringRef &Ext) { - if (Ext.size() == 0) - llvm_unreachable("Already guarded by if-statement in ::parseArchString"); +static size_t findFirstNonVersionCharacter(StringRef Ext) { + assert(!Ext.empty() && + "Already guarded by if-statement in ::parseArchString"); int Pos = Ext.size() - 1; while (Pos > 0 && isDigit(Ext[Pos])) @@ -276,16 +308,13 @@ bool RISCVISAInfo::compareExtension(const std::string &LHS, void RISCVISAInfo::toFeatures( std::vector<StringRef> &Features, std::function<StringRef(const Twine &)> StrAlloc) const { - for (auto &Ext : Exts) { + for (auto const &Ext : Exts) { StringRef ExtName = Ext.first; if (ExtName == "i") continue; - if (ExtName == "zvlsseg") { - Features.push_back("+experimental-v"); - Features.push_back("+experimental-zvlsseg"); - } else if (isExperimentalExtension(ExtName)) { + if (isExperimentalExtension(ExtName)) { Features.push_back(StrAlloc("+experimental-" + ExtName)); } else { Features.push_back(StrAlloc("+" + ExtName)); @@ -434,6 +463,8 @@ RISCVISAInfo::parseFeatures(unsigned XLen, ISAInfo->updateImplication(); ISAInfo->updateFLen(); + ISAInfo->updateMinVLen(); + ISAInfo->updateMaxELen(); if (Error Result = ISAInfo->checkDependency()) return std::move(Result); @@ -657,6 +688,8 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, ISAInfo->updateImplication(); ISAInfo->updateFLen(); + ISAInfo->updateMinVLen(); + ISAInfo->updateMaxELen(); if (Error Result = ISAInfo->checkDependency()) return std::move(Result); @@ -669,6 +702,12 @@ Error RISCVISAInfo::checkDependency() { bool HasE = Exts.count("e") == 1; bool HasD = Exts.count("d") == 1; bool HasF = Exts.count("f") == 1; + bool HasZve32x = Exts.count("zve32x") == 1; + bool HasZve32f = Exts.count("zve32f") == 1; + bool HasZve64d = Exts.count("zve64d") == 1; + bool HasV = Exts.count("v") == 1; + bool HasVector = HasZve32x || HasV; + bool HasZvl = MinVLen != 0; if (HasE && !IsRv32) return createStringError( @@ -683,6 +722,29 @@ Error RISCVISAInfo::checkDependency() { return createStringError(errc::invalid_argument, "d requires f extension to also be specified"); + // FIXME: Consider Zfinx in the future + if (HasZve32f && !HasF) + return createStringError( + errc::invalid_argument, + "zve32f requires f extension to also be specified"); + + // FIXME: Consider Zdinx in the future + if (HasZve64d && !HasD) + return createStringError( + errc::invalid_argument, + "zve64d requires d extension to also be specified"); + + if (HasZvl && !HasVector) + return createStringError( + errc::invalid_argument, + "zvl*b requires v or zve* extension to also be specified"); + + // Could not implement Zve* extension and the V extension at the same time. + if (HasZve32x && HasV) + return createStringError( + errc::invalid_argument, + "It is illegal to specify the v extension with zve* extensions"); + // Additional dependency checks. // TODO: The 'q' extension requires rv64. // TODO: It is illegal to specify 'e' extensions with 'f' and 'd'. 
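
(Editor's aside, not part of the import: a short sketch of how the new Zve*/V dependency rules surface through RISCVISAInfo::parseArchString; the arch strings are illustrative:)

    #include "llvm/Support/Error.h"
    #include "llvm/Support/RISCVISAInfo.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      // zve32f without f trips the dependency check added above.
      auto Bad = llvm::RISCVISAInfo::parseArchString(
          "rv32i_zve32f", /*EnableExperimentalExtension=*/false);
      if (!Bad) {
        llvm::logAllUnhandledErrors(Bad.takeError(), llvm::errs());
        // expected: "zve32f requires f extension to also be specified"
      }

      // The ratified v extension is accepted and (per ImpliedExtsV in the
      // hunk below) pulls in zvl128b, f and d.
      auto Good = llvm::RISCVISAInfo::parseArchString(
          "rv64iv", /*EnableExperimentalExtension=*/false);
      if (Good)
        llvm::outs() << (*Good)->toString() << "\n";
      else
        llvm::logAllUnhandledErrors(Good.takeError(), llvm::errs());
      return 0;
    }
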
@@ -690,8 +752,27 @@ Error RISCVISAInfo::checkDependency() { return Error::success(); } -static const char *ImpliedExtsV[] = {"zvlsseg"}; +static const char *ImpliedExtsV[] = {"zvl128b", "f", "d"}; static const char *ImpliedExtsZfh[] = {"zfhmin"}; +static const char *ImpliedExtsZve64d[] = {"zve64f"}; +static const char *ImpliedExtsZve64f[] = {"zve64x", "zve32f"}; +static const char *ImpliedExtsZve64x[] = {"zve32x", "zvl64b"}; +static const char *ImpliedExtsZve32f[] = {"zve32x"}; +static const char *ImpliedExtsZve32x[] = {"zvl32b"}; +static const char *ImpliedExtsZvl65536b[] = {"zvl32768b"}; +static const char *ImpliedExtsZvl32768b[] = {"zvl16384b"}; +static const char *ImpliedExtsZvl16384b[] = {"zvl8192b"}; +static const char *ImpliedExtsZvl8192b[] = {"zvl4096b"}; +static const char *ImpliedExtsZvl4096b[] = {"zvl2048b"}; +static const char *ImpliedExtsZvl2048b[] = {"zvl1024b"}; +static const char *ImpliedExtsZvl1024b[] = {"zvl512b"}; +static const char *ImpliedExtsZvl512b[] = {"zvl256b"}; +static const char *ImpliedExtsZvl256b[] = {"zvl128b"}; +static const char *ImpliedExtsZvl128b[] = {"zvl64b"}; +static const char *ImpliedExtsZvl64b[] = {"zvl32b"}; +static const char *ImpliedExtsZk[] = {"zkn", "zkt", "zkr"}; +static const char *ImpliedExtsZkn[] = {"zbkb", "zbkc", "zbkx", "zkne", "zknd", "zknh"}; +static const char *ImpliedExtsZks[] = {"zbkb", "zbkc", "zbkx", "zksed", "zksh"}; struct ImpliedExtsEntry { StringLiteral Name; @@ -707,6 +788,25 @@ struct ImpliedExtsEntry { static constexpr ImpliedExtsEntry ImpliedExts[] = { {{"v"}, {ImpliedExtsV}}, {{"zfh"}, {ImpliedExtsZfh}}, + {{"zk"}, {ImpliedExtsZk}}, + {{"zkn"}, {ImpliedExtsZkn}}, + {{"zks"}, {ImpliedExtsZks}}, + {{"zve32f"}, {ImpliedExtsZve32f}}, + {{"zve32x"}, {ImpliedExtsZve32x}}, + {{"zve64d"}, {ImpliedExtsZve64d}}, + {{"zve64f"}, {ImpliedExtsZve64f}}, + {{"zve64x"}, {ImpliedExtsZve64x}}, + {{"zvl1024b"}, {ImpliedExtsZvl1024b}}, + {{"zvl128b"}, {ImpliedExtsZvl128b}}, + {{"zvl16384b"}, {ImpliedExtsZvl16384b}}, + {{"zvl2048b"}, {ImpliedExtsZvl2048b}}, + {{"zvl256b"}, {ImpliedExtsZvl256b}}, + {{"zvl32768b"}, {ImpliedExtsZvl32768b}}, + {{"zvl4096b"}, {ImpliedExtsZvl4096b}}, + {{"zvl512b"}, {ImpliedExtsZvl512b}}, + {{"zvl64b"}, {ImpliedExtsZvl64b}}, + {{"zvl65536b"}, {ImpliedExtsZvl65536b}}, + {{"zvl8192b"}, {ImpliedExtsZvl8192b}}, }; void RISCVISAInfo::updateImplication() { @@ -721,12 +821,25 @@ void RISCVISAInfo::updateImplication() { } assert(llvm::is_sorted(ImpliedExts) && "Table not sorted by Name"); - for (auto &Ext : Exts) { - auto I = llvm::lower_bound(ImpliedExts, Ext.first); - if (I != std::end(ImpliedExts) && I->Name == Ext.first) { - for (auto &ImpliedExt : I->Exts) { + + // This loop may execute over 1 iteration since implication can be layered + // Exits loop if no more implication is applied + SmallSetVector<StringRef, 16> WorkList; + for (auto const &Ext : Exts) + WorkList.insert(Ext.first); + + while (!WorkList.empty()) { + StringRef ExtName = WorkList.pop_back_val(); + auto I = llvm::lower_bound(ImpliedExts, ExtName); + if (I != std::end(ImpliedExts) && I->Name == ExtName) { + for (const char *ImpliedExt : I->Exts) { + if (WorkList.count(ImpliedExt)) + continue; + if (Exts.count(ImpliedExt)) + continue; auto Version = findDefaultVersion(ImpliedExt); addExtension(ImpliedExt, Version->Major, Version->Minor); + WorkList.insert(ImpliedExt); } } } @@ -741,6 +854,41 @@ void RISCVISAInfo::updateFLen() { FLen = 32; } +void RISCVISAInfo::updateMinVLen() { + for (auto const &Ext : Exts) { + StringRef ExtName = Ext.first; + bool 
IsZvlExt = ExtName.consume_front("zvl") && ExtName.consume_back("b"); + if (IsZvlExt) { + unsigned ZvlLen; + if (!ExtName.getAsInteger(10, ZvlLen)) + MinVLen = std::max(MinVLen, ZvlLen); + } + } +} + +void RISCVISAInfo::updateMaxELen() { + // handles EEW restriction by sub-extension zve + for (auto const &Ext : Exts) { + StringRef ExtName = Ext.first; + bool IsZveExt = ExtName.consume_front("zve"); + if (IsZveExt) { + if (ExtName.back() == 'f') + MaxELenFp = std::max(MaxELenFp, 32u); + if (ExtName.back() == 'd') + MaxELenFp = std::max(MaxELenFp, 64u); + ExtName = ExtName.drop_back(); + unsigned ZveELen; + ExtName.getAsInteger(10, ZveELen); + MaxELen = std::max(MaxELen, ZveELen); + } + if (ExtName == "v") { + MaxELenFp = 64; + MaxELen = 64; + return; + } + } +} + std::string RISCVISAInfo::toString() const { std::string Buffer; raw_string_ostream Arch(Buffer); @@ -748,7 +896,7 @@ std::string RISCVISAInfo::toString() const { Arch << "rv" << XLen; ListSeparator LS("_"); - for (auto &Ext : Exts) { + for (auto const &Ext : Exts) { StringRef ExtName = Ext.first; auto ExtInfo = Ext.second; Arch << LS << ExtName; @@ -757,3 +905,17 @@ std::string RISCVISAInfo::toString() const { return Arch.str(); } + +std::vector<std::string> RISCVISAInfo::toFeatureVector() const { + std::vector<std::string> FeatureVector; + for (auto const &Ext : Exts) { + std::string ExtName = Ext.first; + if (ExtName == "i") // i is not recognized in clang -cc1 + continue; + std::string Feature = isExperimentalExtension(ExtName) + ? "+experimental-" + ExtName + : "+" + ExtName; + FeatureVector.push_back(Feature); + } + return FeatureVector; +} diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp index ea90a24eaced..a434e50e8c1f 100644 --- a/llvm/lib/Support/ScopedPrinter.cpp +++ b/llvm/lib/Support/ScopedPrinter.cpp @@ -1,7 +1,6 @@ #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Format.h" -#include <cctype> using namespace llvm::support; diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp index c018dc92bf40..5ce41c987029 100644 --- a/llvm/lib/Support/Signals.cpp +++ b/llvm/lib/Support/Signals.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/StringSaver.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Support/Signposts.cpp b/llvm/lib/Support/Signposts.cpp index 58fafb26cdf3..074dddc81c80 100644 --- a/llvm/lib/Support/Signposts.cpp +++ b/llvm/lib/Support/Signposts.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Signposts.h" -#include "llvm/Support/Timer.h" #include "llvm/Config/config.h" #if LLVM_SUPPORT_XCODE_SIGNPOSTS diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp index f6e2dfb8a6c9..cbb87ea8717c 100644 --- a/llvm/lib/Support/SmallPtrSet.cpp +++ b/llvm/lib/Support/SmallPtrSet.cpp @@ -13,7 +13,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/DenseMapInfo.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemAlloc.h" #include <algorithm> diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp index 2d7721e4e1fb..8cafbc7fad0d 100644 --- a/llvm/lib/Support/SmallVector.cpp +++ b/llvm/lib/Support/SmallVector.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" 
+#include "llvm/Support/MemAlloc.h" #include <cstdint> #ifdef LLVM_ENABLE_EXCEPTIONS #include <stdexcept> diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 1939ed9e9547..137b37f2b1c3 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -15,7 +15,6 @@ #include "llvm/Support/SpecialCaseList.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/VirtualFileSystem.h" diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp index f65d3846623c..012c785b4351 100644 --- a/llvm/lib/Support/StringMap.cpp +++ b/llvm/lib/Support/StringMap.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/Support/DJB.h" #include "llvm/Support/MathExtras.h" diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index 652303fdb6a0..3ed08ed38661 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -597,3 +597,11 @@ bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { hash_code llvm::hash_value(StringRef S) { return hash_combine_range(S.begin(), S.end()); } + +unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) { + assert(Val.data() != getEmptyKey().data() && + "Cannot hash the empty key!"); + assert(Val.data() != getTombstoneKey().data() && + "Cannot hash the tombstone key!"); + return (unsigned)(hash_value(Val)); +} diff --git a/llvm/lib/Support/SymbolRemappingReader.cpp b/llvm/lib/Support/SymbolRemappingReader.cpp index 1caf0947216e..90997ab0a6ce 100644 --- a/llvm/lib/Support/SymbolRemappingReader.cpp +++ b/llvm/lib/Support/SymbolRemappingReader.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" using namespace llvm; diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp index bc60bdea5f62..0105cd2e8153 100644 --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -13,10 +13,8 @@ #include "llvm/Support/TargetParser.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/ADT/Triple.h" using namespace llvm; using namespace AMDGPU; diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index 54ea84d4bd6d..9f92ae1c7a7c 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -13,8 +13,12 @@ #include "llvm/Support/ThreadPool.h" #include "llvm/Config/llvm-config.h" + +#if LLVM_ENABLE_THREADS #include "llvm/Support/Threading.h" +#else #include "llvm/Support/raw_ostream.h" +#endif using namespace llvm; @@ -117,6 +121,10 @@ void ThreadPool::wait() { } } +bool ThreadPool::isWorkerThread() const { + report_fatal_error("LLVM compiled without multithreading"); +} + ThreadPool::~ThreadPool() { wait(); } #endif diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index 2b094a4983a0..9380fa01c84a 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -11,10 +11,10 @@ //===----------------------------------------------------------------------===// #include 
"llvm/Support/TimeProfiler.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Threading.h" diff --git a/llvm/lib/Support/ToolOutputFile.cpp b/llvm/lib/Support/ToolOutputFile.cpp index c192ce60f31c..c2ca97a59c62 100644 --- a/llvm/lib/Support/ToolOutputFile.cpp +++ b/llvm/lib/Support/ToolOutputFile.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ToolOutputFile.h" -#include "llvm/ADT/Triple.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Signals.h" using namespace llvm; diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index 2819dc0c139a..20dea8c302a5 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -14,7 +14,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" #include "llvm/Support/SwapByteOrder.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/Support/ARMTargetParser.h" #include "llvm/Support/VersionTuple.h" #include <cassert> #include <cstring> @@ -663,12 +663,16 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v8_6a; case ARM::ArchKind::ARMV8_7A: return Triple::ARMSubArch_v8_7a; + case ARM::ArchKind::ARMV8_8A: + return Triple::ARMSubArch_v8_8a; case ARM::ArchKind::ARMV9A: return Triple::ARMSubArch_v9; case ARM::ArchKind::ARMV9_1A: return Triple::ARMSubArch_v9_1a; case ARM::ArchKind::ARMV9_2A: return Triple::ARMSubArch_v9_2a; + case ARM::ArchKind::ARMV9_3A: + return Triple::ARMSubArch_v9_3a; case ARM::ArchKind::ARMV8R: return Triple::ARMSubArch_v8r; case ARM::ArchKind::ARMV8MBaseline: diff --git a/llvm/lib/Support/TypeSize.cpp b/llvm/lib/Support/TypeSize.cpp index abb81016a0ba..a80fde83e3bc 100644 --- a/llvm/lib/Support/TypeSize.cpp +++ b/llvm/lib/Support/TypeSize.cpp @@ -8,6 +8,7 @@ #include "llvm/Support/TypeSize.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/WithColor.h" #include "DebugOptions.h" diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index f5cb5895d95d..788460d657fe 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -273,7 +273,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { // the program, and not the eventual binary file. Therefore, call realpath // so this behaves the same on all platforms. #if _POSIX_VERSION >= 200112 || defined(__GLIBC__) - if (char *real_path = realpath(exe_path, NULL)) { + if (char *real_path = realpath(exe_path, nullptr)) { std::string ret = std::string(real_path); free(real_path); return ret; @@ -380,20 +380,22 @@ std::error_code current_path(SmallVectorImpl<char> &result) { return std::error_code(); } - result.reserve(PATH_MAX); + result.resize_for_overwrite(PATH_MAX); while (true) { - if (::getcwd(result.data(), result.capacity()) == nullptr) { + if (::getcwd(result.data(), result.size()) == nullptr) { // See if there was a real error. - if (errno != ENOMEM) + if (errno != ENOMEM) { + result.clear(); return std::error_code(errno, std::generic_category()); + } // Otherwise there just wasn't enough space. 
- result.reserve(result.capacity() * 2); + result.resize_for_overwrite(result.capacity() * 2); } else break; } - result.set_size(strlen(result.data())); + result.truncate(strlen(result.data())); return std::error_code(); } @@ -870,6 +872,17 @@ void mapped_file_region::unmapImpl() { ::munmap(Mapping, Size); } +void mapped_file_region::dontNeedImpl() { + assert(Mode == mapped_file_region::readonly); +#if defined(__MVS__) || defined(_AIX) + // If we don't have madvise, or it isn't beneficial, treat this as a no-op. + return; +#else + if (Mapping) + ::madvise(Mapping, Size, MADV_DONTNEED); +#endif +} + int mapped_file_region::alignment() { return Process::getPageSizeEstimate(); } diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index bec4e8dbe06c..f15e301874c4 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -35,7 +35,6 @@ #include "llvm/Support/FileSystem/UniqueID.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" -#include "llvm/Support/Process.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/YAMLParser.h" @@ -46,9 +45,7 @@ #include <cstdint> #include <iterator> #include <limits> -#include <map> #include <memory> -#include <mutex> #include <string> #include <system_error> #include <utility> @@ -574,6 +571,11 @@ public: } virtual ~InMemoryNode() = default; + /// Return the \p Status for this node. \p RequestedName should be the name + /// through which the caller referred to this node. It will override + /// \p Status::Name in the return value, to mimic the behavior of \p RealFile. + virtual Status getStatus(const Twine &RequestedName) const = 0; + /// Get the filename of this node (the name without the directory part). StringRef getFileName() const { return FileName; } InMemoryNodeKind getKind() const { return Kind; } @@ -589,10 +591,7 @@ public: : InMemoryNode(Stat.getName(), IME_File), Stat(std::move(Stat)), Buffer(std::move(Buffer)) {} - /// Return the \p Status for this node. \p RequestedName should be the name - /// through which the caller referred to this node. It will override - /// \p Status::Name in the return value, to mimic the behavior of \p RealFile. - Status getStatus(const Twine &RequestedName) const { + Status getStatus(const Twine &RequestedName) const override { return Status::copyWithNewName(Stat, RequestedName); } llvm::MemoryBuffer *getBuffer() const { return Buffer.get(); } @@ -616,6 +615,10 @@ public: : InMemoryNode(Path, IME_HardLink), ResolvedFile(ResolvedFile) {} const InMemoryFile &getResolvedFile() const { return ResolvedFile; } + Status getStatus(const Twine &RequestedName) const override { + return ResolvedFile.getStatus(RequestedName); + } + std::string toString(unsigned Indent) const override { return std::string(Indent, ' ') + "HardLink to -> " + ResolvedFile.toString(0); @@ -668,7 +671,7 @@ public: /// Return the \p Status for this node. \p RequestedName should be the name /// through which the caller referred to this node. It will override /// \p Status::Name in the return value, to mimic the behavior of \p RealFile. 
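  // (Editor's note: getStatus is now a virtual on InMemoryNode, so the
  //  status() entry point below can call (*Node)->getStatus(Path) directly
  //  and the old getNodeStatus dispatch helper is removed.)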
- Status getStatus(const Twine &RequestedName) const { + Status getStatus(const Twine &RequestedName) const override { return Status::copyWithNewName(Stat, RequestedName); } @@ -704,17 +707,6 @@ public: } }; -namespace { -Status getNodeStatus(const InMemoryNode *Node, const Twine &RequestedName) { - if (auto Dir = dyn_cast<detail::InMemoryDirectory>(Node)) - return Dir->getStatus(RequestedName); - if (auto File = dyn_cast<detail::InMemoryFile>(Node)) - return File->getStatus(RequestedName); - if (auto Link = dyn_cast<detail::InMemoryHardLink>(Node)) - return Link->getResolvedFile().getStatus(RequestedName); - llvm_unreachable("Unknown node type"); -} -} // namespace } // namespace detail // The UniqueID of in-memory files is derived from path and content. @@ -734,6 +726,16 @@ static sys::fs::UniqueID getDirectoryID(sys::fs::UniqueID Parent, return getUniqueID(llvm::hash_combine(Parent.getFile(), Name)); } +Status detail::NewInMemoryNodeInfo::makeStatus() const { + UniqueID UID = + (Type == sys::fs::file_type::directory_file) + ? getDirectoryID(DirUID, Name) + : getFileID(DirUID, Name, Buffer ? Buffer->getBuffer() : ""); + + return Status(Path, UID, llvm::sys::toTimePoint(ModificationTime), User, + Group, Buffer ? Buffer->getBufferSize() : 0, Type, Perms); +} + InMemoryFileSystem::InMemoryFileSystem(bool UseNormalizedPaths) : Root(new detail::InMemoryDirectory( Status("", getDirectoryID(llvm::sys::fs::UniqueID(), ""), @@ -754,7 +756,7 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, Optional<uint32_t> Group, Optional<llvm::sys::fs::file_type> Type, Optional<llvm::sys::fs::perms> Perms, - const detail::InMemoryFile *HardLinkTarget) { + MakeNodeFn MakeNode) { SmallString<128> Path; P.toVector(Path); @@ -775,7 +777,6 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, const auto ResolvedGroup = Group.getValueOr(0); const auto ResolvedType = Type.getValueOr(sys::fs::file_type::regular_file); const auto ResolvedPerms = Perms.getValueOr(sys::fs::all_all); - assert(!(HardLinkTarget && Buffer) && "HardLink cannot have a buffer"); // Any intermediate directories we create should be accessible by // the owner, even if Perms says otherwise for the final path. const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all; @@ -786,27 +787,10 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, if (!Node) { if (I == E) { // End of the path. - std::unique_ptr<detail::InMemoryNode> Child; - if (HardLinkTarget) - Child.reset(new detail::InMemoryHardLink(P.str(), *HardLinkTarget)); - else { - // Create a new file or directory. - Status Stat( - P.str(), - (ResolvedType == sys::fs::file_type::directory_file) - ? 
getDirectoryID(Dir->getUniqueID(), Name) - : getFileID(Dir->getUniqueID(), Name, Buffer->getBuffer()), - llvm::sys::toTimePoint(ModificationTime), ResolvedUser, - ResolvedGroup, Buffer->getBufferSize(), ResolvedType, - ResolvedPerms); - if (ResolvedType == sys::fs::file_type::directory_file) { - Child.reset(new detail::InMemoryDirectory(std::move(Stat))); - } else { - Child.reset( - new detail::InMemoryFile(std::move(Stat), std::move(Buffer))); - } - } - Dir->addChild(Name, std::move(Child)); + Dir->addChild( + Name, MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime, + std::move(Buffer), ResolvedUser, ResolvedGroup, + ResolvedType, ResolvedPerms})); return true; } @@ -850,7 +834,15 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, Optional<llvm::sys::fs::file_type> Type, Optional<llvm::sys::fs::perms> Perms) { return addFile(P, ModificationTime, std::move(Buffer), User, Group, Type, - Perms, /*HardLinkTarget=*/nullptr); + Perms, + [](detail::NewInMemoryNodeInfo NNI) + -> std::unique_ptr<detail::InMemoryNode> { + Status Stat = NNI.makeStatus(); + if (Stat.getType() == sys::fs::file_type::directory_file) + return std::make_unique<detail::InMemoryDirectory>(Stat); + return std::make_unique<detail::InMemoryFile>( + Stat, std::move(NNI.Buffer)); + }); } bool InMemoryFileSystem::addFileNoOwn(const Twine &P, time_t ModificationTime, @@ -861,7 +853,15 @@ bool InMemoryFileSystem::addFileNoOwn(const Twine &P, time_t ModificationTime, Optional<llvm::sys::fs::perms> Perms) { return addFile(P, ModificationTime, llvm::MemoryBuffer::getMemBuffer(Buffer), std::move(User), std::move(Group), std::move(Type), - std::move(Perms)); + std::move(Perms), + [](detail::NewInMemoryNodeInfo NNI) + -> std::unique_ptr<detail::InMemoryNode> { + Status Stat = NNI.makeStatus(); + if (Stat.getType() == sys::fs::file_type::directory_file) + return std::make_unique<detail::InMemoryDirectory>(Stat); + return std::make_unique<detail::InMemoryFile>( + Stat, std::move(NNI.Buffer)); + }); } static ErrorOr<const detail::InMemoryNode *> @@ -916,14 +916,17 @@ bool InMemoryFileSystem::addHardLink(const Twine &FromPath, // before. Resolved ToPath must be a File. if (!ToNode || FromNode || !isa<detail::InMemoryFile>(*ToNode)) return false; - return this->addFile(FromPath, 0, nullptr, None, None, None, None, - cast<detail::InMemoryFile>(*ToNode)); + return addFile(FromPath, 0, nullptr, None, None, None, None, + [&](detail::NewInMemoryNodeInfo NNI) { + return std::make_unique<detail::InMemoryHardLink>( + NNI.Path.str(), *cast<detail::InMemoryFile>(*ToNode)); + }); } llvm::ErrorOr<Status> InMemoryFileSystem::status(const Twine &Path) { auto Node = lookupInMemoryNode(*this, Root.get(), Path); if (Node) - return detail::getNodeStatus(*Node, Path); + return (*Node)->getStatus(Path); return Node.getError(); } @@ -1649,10 +1652,19 @@ private: sys::path::Style::windows_backslash)) { path_style = sys::path::Style::windows_backslash; } else { - assert(NameValueNode && "Name presence should be checked earlier"); - error(NameValueNode, + // Relative VFS root entries are made absolute to the current working + // directory, then we can determine the path style from that. + auto EC = sys::fs::make_absolute(Name); + if (EC) { + assert(NameValueNode && "Name presence should be checked earlier"); + error( + NameValueNode, "entry with relative path at the root level is not discoverable"); - return nullptr; + return nullptr; + } + path_style = sys::path::is_absolute(Name, sys::path::Style::posix) + ? 
sys::path::Style::posix + : sys::path::Style::windows_backslash; } } diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index b15e71a9ce2a..5f1a364ea1a8 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -132,7 +132,8 @@ const file_t kInvalidFile = INVALID_HANDLE_VALUE; std::string getMainExecutable(const char *argv0, void *MainExecAddr) { SmallVector<wchar_t, MAX_PATH> PathName; - DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.capacity()); + PathName.resize_for_overwrite(PathName.capacity()); + DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.size()); // A zero return value indicates a failure other than insufficient space. if (Size == 0) @@ -145,7 +146,7 @@ std::string getMainExecutable(const char *argv0, void *MainExecAddr) { // On success, GetModuleFileNameW returns the number of characters written to // the buffer not including the NULL terminator. - PathName.set_size(Size); + PathName.truncate(Size); // Convert the result from UTF-16 to UTF-8. SmallVector<char, MAX_PATH> PathNameUTF8; @@ -201,8 +202,8 @@ std::error_code current_path(SmallVectorImpl<char> &result) { DWORD len = MAX_PATH; do { - cur_path.reserve(len); - len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data()); + cur_path.resize_for_overwrite(len); + len = ::GetCurrentDirectoryW(cur_path.size(), cur_path.data()); // A zero return value indicates a failure other than insufficient space. if (len == 0) @@ -210,11 +211,11 @@ std::error_code current_path(SmallVectorImpl<char> &result) { // If there's insufficient space, the len returned is larger than the len // given. - } while (len > cur_path.capacity()); + } while (len > cur_path.size()); // On success, GetCurrentDirectoryW returns the number of characters not // including the null-terminator. - cur_path.set_size(len); + cur_path.truncate(len); if (std::error_code EC = UTF16ToUTF8(cur_path.begin(), cur_path.size(), result)) @@ -328,7 +329,7 @@ static std::error_code is_local_internal(SmallVectorImpl<wchar_t> &Path, // the null terminator, it will leave the output unterminated. Push a null // terminator onto the end to ensure that this never happens. VolumePath.push_back(L'\0'); - VolumePath.set_size(wcslen(VolumePath.data())); + VolumePath.truncate(wcslen(VolumePath.data())); const wchar_t *P = VolumePath.data(); UINT Type = ::GetDriveTypeW(P); @@ -364,18 +365,19 @@ std::error_code is_local(const Twine &path, bool &result) { static std::error_code realPathFromHandle(HANDLE H, SmallVectorImpl<wchar_t> &Buffer) { + Buffer.resize_for_overwrite(Buffer.capacity()); DWORD CountChars = ::GetFinalPathNameByHandleW( H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); if (CountChars && CountChars >= Buffer.capacity()) { // The buffer wasn't big enough, try again. In this case the return value // *does* indicate the size of the null terminator. 
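    // (Same retry idiom as the getcwd loop above, sketched against a
    // hypothetical Win32-style call ApiW(buf, len) that returns the required
    // length whenever len is too small:
    //   Buf.resize_for_overwrite(Buf.capacity());
    //   DWORD N = ApiW(Buf.data(), Buf.size());
    //   if (N > Buf.size()) {               // too small; N is the needed size
    //     Buf.resize_for_overwrite(N);
    //     N = ApiW(Buf.data(), Buf.size());
    //   }
    //   Buf.truncate(N);                    // N counts the characters written
    // )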
- Buffer.reserve(CountChars); + Buffer.resize_for_overwrite(CountChars); CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); + H, Buffer.begin(), Buffer.size(), FILE_NAME_NORMALIZED); } + Buffer.truncate(CountChars); if (CountChars == 0) return mapWindowsError(GetLastError()); - Buffer.set_size(CountChars); return std::error_code(); } @@ -959,6 +961,8 @@ void mapped_file_region::unmapImpl() { } } +void mapped_file_region::dontNeedImpl() {} + int mapped_file_region::alignment() { SYSTEM_INFO SysInfo; ::GetSystemInfo(&SysInfo); @@ -1448,14 +1452,14 @@ static bool getTempDirEnvVar(const wchar_t *Var, SmallVectorImpl<char> &Res) { SmallVector<wchar_t, 1024> Buf; size_t Size = 1024; do { - Buf.reserve(Size); - Size = GetEnvironmentVariableW(Var, Buf.data(), Buf.capacity()); + Buf.resize_for_overwrite(Size); + Size = GetEnvironmentVariableW(Var, Buf.data(), Buf.size()); if (Size == 0) return false; // Try again with larger buffer. - } while (Size > Buf.capacity()); - Buf.set_size(Size); + } while (Size > Buf.size()); + Buf.truncate(Size); return !windows::UTF16ToUTF8(Buf.data(), Size, Res); } @@ -1504,7 +1508,7 @@ std::error_code CodePageToUTF16(unsigned codepage, } utf16.reserve(len + 1); - utf16.set_size(len); + utf16.resize_for_overwrite(len); len = ::MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, original.begin(), original.size(), utf16.begin(), utf16.size()); @@ -1544,8 +1548,8 @@ std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16, return mapWindowsError(::GetLastError()); } - converted.reserve(len); - converted.set_size(len); + converted.reserve(len + 1); + converted.resize_for_overwrite(len); // Now do the actual conversion. len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, converted.data(), diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc index 6732063b562e..dfaab1613de1 100644 --- a/llvm/lib/Support/Windows/Process.inc +++ b/llvm/lib/Support/Windows/Process.inc @@ -129,16 +129,16 @@ Optional<std::string> Process::GetEnv(StringRef Name) { SmallVector<wchar_t, MAX_PATH> Buf; size_t Size = MAX_PATH; do { - Buf.reserve(Size); + Buf.resize_for_overwrite(Size); SetLastError(NO_ERROR); Size = - GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity()); + GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.size()); if (Size == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND) return None; // Try again with larger buffer. - } while (Size > Buf.capacity()); - Buf.set_size(Size); + } while (Size > Buf.size()); + Buf.truncate(Size); // Convert the result from UTF-16 to UTF-8. SmallVector<char, MAX_PATH> Res; diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index a9cf2db7ec72..ee633411584f 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -72,7 +72,7 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name, SmallVector<wchar_t, MAX_PATH> U16Result; DWORD Len = MAX_PATH; do { - U16Result.reserve(Len); + U16Result.resize_for_overwrite(Len); // Lets attach the extension manually. That is needed for files // with a point in name like aaa.bbb. SearchPathW will not add extension // from its argument to such files because it thinks they already had one. 
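      // (SearchPathW follows the usual Win32 convention: when the buffer is
      // too small it returns the required length, which the Len > size()
      // retry in the next hunk keys on; on success Len counts the characters
      // actually written, hence the final truncate(Len).)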
@@ -82,13 +82,13 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name, return EC; Len = ::SearchPathW(Path, c_str(U16NameExt), nullptr, - U16Result.capacity(), U16Result.data(), nullptr); - } while (Len > U16Result.capacity()); + U16Result.size(), U16Result.data(), nullptr); + } while (Len > U16Result.size()); if (Len == 0) continue; - U16Result.set_size(Len); + U16Result.truncate(Len); if (std::error_code EC = windows::UTF16ToUTF8(U16Result.data(), U16Result.size(), U8Result)) diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index ab49ac548f89..10f9692d217e 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -12,7 +12,6 @@ #include "llvm/Support/X86TargetParser.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Triple.h" #include <numeric> using namespace llvm; diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index 2adf37a511d1..200261d3ed5c 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -27,7 +27,6 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/Unicode.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp index aa6163a76161..8cdd03149bcf 100644 --- a/llvm/lib/Support/YAMLTraits.cpp +++ b/llvm/lib/Support/YAMLTraits.cpp @@ -18,13 +18,12 @@ #include "llvm/Support/Format.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Unicode.h" +#include "llvm/Support/VersionTuple.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> #include <cstdint> -#include <cstdlib> #include <cstring> #include <string> #include <vector> @@ -300,7 +299,7 @@ void Input::endEnumScalar() { bool Input::beginBitSetScalar(bool &DoClear) { BitValuesUsed.clear(); if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { - BitValuesUsed.insert(BitValuesUsed.begin(), SQ->Entries.size(), false); + BitValuesUsed.resize(SQ->Entries.size()); } else { setError(CurrentNode, "expected sequence of bit values"); } @@ -527,8 +526,9 @@ std::vector<StringRef> Output::keys() { } bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault, - bool &UseDefault, void *&) { + bool &UseDefault, void *&SaveInfo) { UseDefault = false; + SaveInfo = nullptr; if (Required || !SameAsDefault || WriteDefaultValues) { auto State = StateStack.back(); if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) { @@ -599,7 +599,8 @@ void Output::endSequence() { StateStack.pop_back(); } -bool Output::preflightElement(unsigned, void *&) { +bool Output::preflightElement(unsigned, void *&SaveInfo) { + SaveInfo = nullptr; return true; } @@ -627,7 +628,7 @@ void Output::endFlowSequence() { outputUpToEndOfLine(" ]"); } -bool Output::preflightFlowElement(unsigned, void *&) { +bool Output::preflightFlowElement(unsigned, void *&SaveInfo) { if (NeedFlowSequenceComma) output(", "); if (WrapColumn && Column > WrapColumn) { @@ -637,6 +638,7 @@ bool Output::preflightFlowElement(unsigned, void *&) { Column = ColumnAtFlowStart; output(" "); } + SaveInfo = nullptr; return true; } diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 4590a3d19b0d..e4b747b68bea 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -15,6 +15,7 @@ #include 
"llvm/ADT/StringExtras.h" #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Duration.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" @@ -24,10 +25,8 @@ #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include <algorithm> -#include <cctype> #include <cerrno> #include <cstdio> -#include <iterator> #include <sys/stat.h> // <fcntl.h> may provide O_BINARY. @@ -643,13 +642,14 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered, // Get the starting position. off_t loc = ::lseek(FD, 0, SEEK_CUR); -#ifdef _WIN32 - // MSVCRT's _lseek(SEEK_CUR) doesn't return -1 for pipes. sys::fs::file_status Status; std::error_code EC = status(FD, Status); - SupportsSeeking = !EC && Status.type() == sys::fs::file_type::regular_file; + IsRegularFile = Status.type() == sys::fs::file_type::regular_file; +#ifdef _WIN32 + // MSVCRT's _lseek(SEEK_CUR) doesn't return -1 for pipes. + SupportsSeeking = !EC && IsRegularFile; #else - SupportsSeeking = loc != (off_t)-1; + SupportsSeeking = !EC && loc != (off_t)-1; #endif if (!SupportsSeeking) pos = 0; @@ -869,8 +869,8 @@ Expected<sys::fs::FileLocker> raw_fd_ostream::lock() { } Expected<sys::fs::FileLocker> -raw_fd_ostream::tryLockFor(std::chrono::milliseconds Timeout) { - std::error_code EC = sys::fs::tryLockFile(FD, Timeout); +raw_fd_ostream::tryLockFor(Duration const& Timeout) { + std::error_code EC = sys::fs::tryLockFile(FD, Timeout.getDuration()); if (!EC) return sys::fs::FileLocker(FD); return errorCodeToError(EC); @@ -914,8 +914,7 @@ raw_fd_stream::raw_fd_stream(StringRef Filename, std::error_code &EC) if (EC) return; - // Do not support non-seekable files. - if (!supportsSeeking()) + if (!isRegularFile()) EC = std::make_error_code(std::errc::invalid_argument); } @@ -937,10 +936,6 @@ bool raw_fd_stream::classof(const raw_ostream *OS) { // raw_string_ostream //===----------------------------------------------------------------------===// -raw_string_ostream::~raw_string_ostream() { - flush(); -} - void raw_string_ostream::write_impl(const char *Ptr, size_t Size) { OS.append(Ptr, Size); } diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 6ccca4d69f40..3709a375ed1b 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -3203,7 +3203,8 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) { // iteration variable being assigned. 
ListInit *EmptyList = ListInit::get({}, BitRecTy::get()); - ListInit *SingletonList = ListInit::get({BitInit::get(1)}, BitRecTy::get()); + ListInit *SingletonList = + ListInit::get({BitInit::get(true)}, BitRecTy::get()); RecTy *BitListTy = ListRecTy::get(BitRecTy::get()); // The foreach containing the then-clause selects SingletonList if diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index b0dd30c13137..4d1464901777 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -26,7 +26,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class FunctionPass; class InstructionSelector; -class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); FunctionPass *createAArch64RedundantCopyEliminationPass(); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index cb17fd94c335..b87468d5c8de 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -416,6 +416,12 @@ def FeatureHCX : SubtargetFeature< def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64", "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">; +def FeatureHBC : SubtargetFeature<"hbc", "HasHBC", + "true", "Enable Armv8.8-A Hinted Conditional Branches Extension">; + +def FeatureMOPS : SubtargetFeature<"mops", "HasMOPS", + "true", "Enable Armv8.8-A memcpy and memset acceleration instructions">; + def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE", "true", "Enable Branch Record Buffer Extension">; @@ -497,6 +503,10 @@ def HasV8_7aOps : SubtargetFeature< "v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions", [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>; +def HasV8_8aOps : SubtargetFeature< + "v8.8a", "HasV8_8aOps", "true", "Support ARM v8.8a instructions", + [HasV8_7aOps, FeatureHBC, FeatureMOPS]>; + def HasV9_0aOps : SubtargetFeature< "v9a", "HasV9_0aOps", "true", "Support ARM v9a instructions", [HasV8_5aOps, FeatureSVE2]>; @@ -509,21 +519,22 @@ def HasV9_2aOps : SubtargetFeature< "v9.2a", "HasV9_2aOps", "true", "Support ARM v9.2a instructions", [HasV8_7aOps, HasV9_1aOps]>; +def HasV9_3aOps : SubtargetFeature< + "v9.3a", "HasV9_3aOps", "true", "Support ARM v9.3a instructions", + [HasV8_8aOps, HasV9_2aOps]>; + def HasV8_0rOps : SubtargetFeature< "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions", [//v8.1 FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2, //v8.2 - FeaturePerfMon, FeatureRAS, FeaturePsUAO, FeatureSM4, - FeatureSHA3, FeatureCCPP, FeatureFullFP16, FeaturePAN_RWV, + FeatureRAS, FeaturePsUAO, FeatureCCPP, FeaturePAN_RWV, //v8.3 FeatureComplxNum, FeatureCCIDX, FeatureJS, FeaturePAuth, FeatureRCPC, //v8.4 - FeatureDotProd, FeatureFP16FML, FeatureTRACEV8_4, - FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, - //v8.5 - FeatureSSBS, FeaturePredRes, FeatureSB, FeatureSpecRestrict]>; + FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI, + FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>; //===----------------------------------------------------------------------===// // Register File Description @@ -955,7 +966,9 @@ def ProcessorFeatures { list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureETE, FeatureMTE, FeatureFP16FML, FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8]; - list<SubtargetFeature> R82 = [HasV8_0rOps]; + list<SubtargetFeature> R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, + FeatureFP16FML, FeatureSSBS, FeaturePredRes, + FeatureSB, 
FeatureSpecRestrict]; list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd]; diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index c90601443934..f26151536a58 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -468,7 +468,7 @@ def CSR_Darwin_AArch64_TLS // CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS. def CSR_Darwin_AArch64_CXX_TLS : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, - (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sub (sequence "X%u", 1, 28), X9, X15, X16, X17, X18, X19), (sequence "D%u", 0, 31))>; // CSRs that are handled by prologue, epilogue. diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index ee6e670fe3cd..109b739528bf 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -443,7 +443,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask; bool FalseZero = FalseLanes == AArch64::FalseLanesZero; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); if (DType == AArch64::DestructiveBinary) @@ -989,7 +989,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, .addReg(DstReg, RegState::Kill) .addReg(DstReg, DstFlags | RegState::Implicit); } else { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui)) .add(MI.getOperand(0)) .addUse(DstReg, RegState::Kill); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 3dc694df509d..c67fa62c7a92 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -355,7 +355,7 @@ unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); + Register ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addFrameIndex(SI->second) @@ -378,7 +378,7 @@ unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) { const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(ZeroReg, getKillRegState(true)); return ResultReg; @@ -410,11 +410,11 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { const TargetRegisterClass *RC = Is64Bit ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg) .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(TmpReg, getKillRegState(true)); @@ -427,12 +427,12 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { Align Alignment = DL.getPrefTypeAlign(CFP->getType()); unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment); - unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + Register ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE); unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(ADRPReg) .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); @@ -455,7 +455,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { if (!DestEVT.isSimple()) return 0; - unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + Register ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); unsigned ResultReg; if (OpFlags & AArch64II::MO_GOT) { @@ -482,7 +482,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { // LDRWui produces a 32-bit register, but pointers in-register are 64-bits // so we must extend the result on ILP32. 
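      // (SUBREG_TO_REG with a 0 immediate, as used below, widens a value
      // whose high bits are already known to be zero: it re-tags the 32-bit
      // vreg as the sub_32 lane of a fresh 64-bit vreg without emitting any
      // extend instruction.)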
- unsigned Result64 = createResultReg(&AArch64::GPR64RegClass); + Register Result64 = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(Result64) @@ -751,7 +751,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) if (const auto *C = dyn_cast<ConstantInt>(RHS)) if (C->getValue() == 0xffffffff) { Addr.setExtendType(AArch64_AM::UXTW); - unsigned Reg = getRegForValue(LHS); + Register Reg = getRegForValue(LHS); if (!Reg) return false; Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32); @@ -760,7 +760,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } } - unsigned Reg = getRegForValue(Src); + Register Reg = getRegForValue(Src); if (!Reg) return false; Addr.setOffsetReg(Reg); @@ -821,7 +821,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } } - unsigned Reg = getRegForValue(Src); + Register Reg = getRegForValue(Src); if (!Reg) return false; Addr.setOffsetReg(Reg); @@ -847,7 +847,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) Addr.setExtendType(AArch64_AM::LSL); Addr.setExtendType(AArch64_AM::UXTW); - unsigned Reg = getRegForValue(LHS); + Register Reg = getRegForValue(LHS); if (!Reg) return false; Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32); @@ -879,7 +879,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) break; Addr.setShift(0); - unsigned Reg = getRegForValue(Src); + Register Reg = getRegForValue(Src); if (!Reg) return false; Addr.setOffsetReg(Reg); @@ -888,7 +888,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } // end switch if (Addr.isRegBase() && !Addr.getReg()) { - unsigned Reg = getRegForValue(Obj); + Register Reg = getRegForValue(Obj); if (!Reg) return false; Addr.setReg(Reg); @@ -896,7 +896,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } if (!Addr.getOffsetReg()) { - unsigned Reg = getRegForValue(Obj); + Register Reg = getRegForValue(Obj); if (!Reg) return false; Addr.setOffsetReg(Reg); @@ -1034,7 +1034,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // continue. This should almost never happen. 
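  // (When a frame-index base has to coexist with a register offset, the code
  // below first materializes the frame index into a plain GPR64 via ADDXri,
  // after which the address folds like any ordinary reg+reg or reg+imm case.)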
if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) { - unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); + Register ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addFrameIndex(Addr.getFI()) @@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, SI->getOpcode() == Instruction::AShr ) std::swap(LHS, RHS); - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; @@ -1207,13 +1207,13 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { - unsigned RHSReg = getRegForValue(SI->getOperand(0)); + Register RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, C->getZExtValue(), SetFlags, WantResult); } - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, 0, @@ -1232,7 +1232,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); - unsigned RHSReg = getRegForValue(MulLHS); + Register RHSReg = getRegForValue(MulLHS); if (!RHSReg) return 0; ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, AArch64_AM::LSL, @@ -1255,7 +1255,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, } uint64_t ShiftVal = C->getZExtValue(); if (ShiftType != AArch64_AM::InvalidShiftExtend) { - unsigned RHSReg = getRegForValue(SI->getOperand(0)); + Register RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, ShiftType, @@ -1267,7 +1267,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, } } - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; @@ -1489,7 +1489,7 @@ bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) { if (CFP->isZero() && !CFP->isNegative()) UseImm = true; - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (!LHSReg) return false; @@ -1500,7 +1500,7 @@ bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) { return true; } - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); if (!RHSReg) return false; @@ -1577,7 +1577,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (isa<ConstantInt>(SI->getOperand(1))) std::swap(LHS, RHS); - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; @@ -1602,7 +1602,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); - unsigned RHSReg = getRegForValue(MulLHS); + Register RHSReg = getRegForValue(MulLHS); if (!RHSReg) return 0; ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal); @@ -1616,7 +1616,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, 
MVT RetVT, if (const auto *SI = dyn_cast<ShlOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { uint64_t ShiftVal = C->getZExtValue(); - unsigned RHSReg = getRegForValue(SI->getOperand(0)); + Register RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal); @@ -1625,7 +1625,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, } } - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; @@ -1673,7 +1673,7 @@ unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, if (!AArch64_AM::isLogicalImmediate(Imm, RegSize)) return 0; - unsigned ResultReg = + Register ResultReg = fastEmitInst_ri(Opc, RC, LHSReg, AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) { @@ -1715,7 +1715,7 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, RC = &AArch64::GPR64RegClass; break; } - unsigned ResultReg = + Register ResultReg = fastEmitInst_rri(Opc, RC, LHSReg, RHSReg, AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm)); if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { @@ -1841,7 +1841,7 @@ unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, } // Create the base instruction, then add the operands. - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO); @@ -1856,7 +1856,7 @@ unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, // For zero-extending loads to 64bit we emit a 32bit load and then convert // the 32bit reg to a 64bit reg. if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) { - unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + Register Reg64 = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) @@ -1991,7 +1991,7 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { // The integer extend hasn't been emitted yet. FastISel or SelectionDAG // could select it. Emit a copy to subreg if necessary. FastISel will remove // it when it selects the integer extend. - unsigned Reg = lookUpRegForValue(IntExtVal); + Register Reg = lookUpRegForValue(IntExtVal); auto *MI = MRI.getUniqueVRegDef(Reg); if (!MI) { if (RetVT == MVT::i64 && VT <= MVT::i32) { @@ -2174,7 +2174,7 @@ bool AArch64FastISel::selectStore(const Instruction *I) { // The non-atomic instructions are sufficient for relaxed stores. if (isReleaseOrStronger(Ord)) { // The STLR addressing mode only supports a base reg; pass that directly. - unsigned AddrReg = getRegForValue(PtrV); + Register AddrReg = getRegForValue(PtrV); return emitStoreRelease(VT, SrcReg, AddrReg, createMachineMemOperandFor(I)); } @@ -2339,7 +2339,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit]; const MCInstrDesc &II = TII.get(Opc); - unsigned SrcReg = getRegForValue(LHS); + Register SrcReg = getRegForValue(LHS); if (!SrcReg) return false; @@ -2454,7 +2454,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { if (foldXALUIntrinsic(CC, I, BI->getCondition())) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. 
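    // (getRegForValue is called purely for its side effect here: it forces
    // the overflow intrinsic to be selected so the flags it sets are actually
    // produced; CondReg itself is never consumed on this path.)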
- unsigned CondReg = getRegForValue(BI->getCondition()); + Register CondReg = getRegForValue(BI->getCondition()); if (!CondReg) return false; @@ -2468,7 +2468,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { } } - unsigned CondReg = getRegForValue(BI->getCondition()); + Register CondReg = getRegForValue(BI->getCondition()); if (CondReg == 0) return false; @@ -2480,7 +2480,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { } const MCInstrDesc &II = TII.get(Opcode); - unsigned ConstrainedCondReg + Register ConstrainedCondReg = constrainOperandRegClass(II, CondReg, II.getNumDefs()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addReg(ConstrainedCondReg) @@ -2493,7 +2493,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { bool AArch64FastISel::selectIndirectBr(const Instruction *I) { const IndirectBrInst *BI = cast<IndirectBrInst>(I); - unsigned AddrReg = getRegForValue(BI->getOperand(0)); + Register AddrReg = getRegForValue(BI->getOperand(0)); if (AddrReg == 0) return false; @@ -2563,7 +2563,7 @@ bool AArch64FastISel::selectCmp(const Instruction *I) { } if (CondCodes) { - unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass); + Register TmpReg1 = createResultReg(&AArch64::GPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), TmpReg1) .addReg(AArch64::WZR, getKillRegState(true)) @@ -2630,18 +2630,18 @@ bool AArch64FastISel::optimizeSelect(const SelectInst *SI) { if (!Opc) return false; - unsigned Src1Reg = getRegForValue(Src1Val); + Register Src1Reg = getRegForValue(Src1Val); if (!Src1Reg) return false; - unsigned Src2Reg = getRegForValue(Src2Val); + Register Src2Reg = getRegForValue(Src2Val); if (!Src2Reg) return false; if (NeedExtraOp) Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, 1); - unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg, + Register ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg, Src2Reg); updateValueMap(SI, ResultReg); return true; @@ -2690,7 +2690,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { // Try to pickup the flags, so we don't have to emit another compare. if (foldXALUIntrinsic(CC, I, Cond)) { // Fake request the condition to force emission of the XALU intrinsic. 
- unsigned CondReg = getRegForValue(Cond); + Register CondReg = getRegForValue(Cond); if (!CondReg) return false; } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() && @@ -2711,7 +2711,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { } if (FoldSelect) { - unsigned SrcReg = getRegForValue(FoldSelect); + Register SrcReg = getRegForValue(FoldSelect); if (!SrcReg) return false; @@ -2739,7 +2739,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { } assert((CC != AArch64CC::AL) && "Unexpected condition code."); } else { - unsigned CondReg = getRegForValue(Cond); + Register CondReg = getRegForValue(Cond); if (!CondReg) return false; @@ -2753,8 +2753,8 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); } - unsigned Src1Reg = getRegForValue(SI->getTrueValue()); - unsigned Src2Reg = getRegForValue(SI->getFalseValue()); + Register Src1Reg = getRegForValue(SI->getTrueValue()); + Register Src2Reg = getRegForValue(SI->getFalseValue()); if (!Src1Reg || !Src2Reg) return false; @@ -2762,7 +2762,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) { if (ExtraCC != AArch64CC::AL) Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, ExtraCC); - unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, CC); + Register ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, CC); updateValueMap(I, ResultReg); return true; } @@ -2772,11 +2772,11 @@ bool AArch64FastISel::selectFPExt(const Instruction *I) { if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) return false; - unsigned Op = getRegForValue(V); + Register Op = getRegForValue(V); if (Op == 0) return false; - unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); + Register ResultReg = createResultReg(&AArch64::FPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), ResultReg).addReg(Op); updateValueMap(I, ResultReg); @@ -2788,11 +2788,11 @@ bool AArch64FastISel::selectFPTrunc(const Instruction *I) { if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) return false; - unsigned Op = getRegForValue(V); + Register Op = getRegForValue(V); if (Op == 0) return false; - unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); + Register ResultReg = createResultReg(&AArch64::FPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), ResultReg).addReg(Op); updateValueMap(I, ResultReg); @@ -2805,7 +2805,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; - unsigned SrcReg = getRegForValue(I->getOperand(0)); + Register SrcReg = getRegForValue(I->getOperand(0)); if (SrcReg == 0) return false; @@ -2825,7 +2825,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { else Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; } - unsigned ResultReg = createResultReg( + Register ResultReg = createResultReg( DestVT == MVT::i32 ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(SrcReg); @@ -2844,7 +2844,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { assert((DestVT == MVT::f32 || DestVT == MVT::f64) && "Unexpected value type."); - unsigned SrcReg = getRegForValue(I->getOperand(0)); + Register SrcReg = getRegForValue(I->getOperand(0)); if (!SrcReg) return false; @@ -2871,7 +2871,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; } - unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg); + Register ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg); updateValueMap(I, ResultReg); return true; } @@ -2975,11 +2975,11 @@ bool AArch64FastISel::fastLowerArguments() { } else llvm_unreachable("Unexpected value type."); - unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); @@ -3009,7 +3009,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, const Value *ArgVal = CLI.OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; - unsigned ArgReg = getRegForValue(ArgVal); + Register ArgReg = getRegForValue(ArgVal); if (!ArgReg) return false; @@ -3104,7 +3104,7 @@ bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, if (CopyVT.isVector() && !Subtarget->isLittleEndian()) return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(RVLocs[0].getLocReg()); @@ -3209,14 +3209,14 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { else if (Addr.getGlobalValue()) MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0); else if (Addr.getReg()) { - unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0); + Register Reg = constrainOperandRegClass(II, Addr.getReg(), 0); MIB.addReg(Reg); } else return false; } else { unsigned CallReg = 0; if (Symbol) { - unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + Register ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg) .addSym(Symbol, AArch64II::MO_GOT | AArch64II::MO_PAGE); @@ -3438,7 +3438,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // SP = FP + Fixed Object + 16 int FI = MFI.CreateFixedObject(4, 0, false); - unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); + Register ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addFrameIndex(FI) @@ -3568,10 +3568,10 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { Opc = AArch64::FABSDr; break; } - unsigned SrcReg = getRegForValue(II->getOperand(0)); + Register SrcReg = getRegForValue(II->getOperand(0)); if (!SrcReg) return false; - 
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(SrcReg); updateValueMap(II, ResultReg); @@ -3593,7 +3593,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { if (!isTypeLegal(RetTy, VT)) return false; - unsigned Op0Reg = getRegForValue(II->getOperand(0)); + Register Op0Reg = getRegForValue(II->getOperand(0)); if (!Op0Reg) return false; @@ -3671,17 +3671,17 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { break; case Intrinsic::smul_with_overflow: { CC = AArch64CC::NE; - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (!LHSReg) return false; - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); if (!RHSReg) return false; if (VT == MVT::i32) { MulReg = emitSMULL_rr(MVT::i64, LHSReg, RHSReg); - unsigned MulSubReg = + Register MulSubReg = fastEmitInst_extractsubreg(VT, MulReg, AArch64::sub_32); // cmp xreg, wreg, sxtw emitAddSub_rx(/*UseAdd=*/false, MVT::i64, MulReg, MulSubReg, @@ -3701,11 +3701,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { } case Intrinsic::umul_with_overflow: { CC = AArch64CC::NE; - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (!LHSReg) return false; - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); if (!RHSReg) return false; @@ -3799,7 +3799,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!VA.isRegLoc()) return false; - unsigned Reg = getRegForValue(RV); + Register Reg = getRegForValue(RV); if (Reg == 0) return false; @@ -3879,7 +3879,7 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) { DestVT != MVT::i1) return false; - unsigned SrcReg = getRegForValue(Op); + Register SrcReg = getRegForValue(Op); if (!SrcReg) return false; @@ -3906,7 +3906,7 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) { break; } // Issue an extract_subreg to get the lower 32-bits. - unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, + Register Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, AArch64::sub_32); // Create the AND instruction which performs the actual truncation. ResultReg = emitAnd_ri(MVT::i32, Reg32, Mask); @@ -4007,7 +4007,7 @@ unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, if (NeedTrunc) Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask); - unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); + Register ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); if (NeedTrunc) ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); return ResultReg; @@ -4033,7 +4033,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, // Just emit a copy for "zero" shifts. 
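  // (A zero shift changes nothing about the value, but FastISel must still
  // produce a defined result vreg, hence the bare COPY when source and
  // result types already agree.)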
if (Shift == 0) { if (RetVT == SrcVT) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(Op0); @@ -4110,7 +4110,7 @@ unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Mask); Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask); } - unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); + Register ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); if (NeedTrunc) ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); return ResultReg; @@ -4136,7 +4136,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, // Just emit a copy for "zero" shifts. if (Shift == 0) { if (RetVT == SrcVT) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(Op0); @@ -4226,7 +4226,7 @@ unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*isZExt=*/false); Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask); } - unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); + Register ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg); if (NeedTrunc) ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask); return ResultReg; @@ -4252,7 +4252,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, // Just emit a copy for "zero" shifts. if (Shift == 0) { if (RetVT == SrcVT) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(Op0); @@ -4428,7 +4428,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, return false; // Check if the load instruction has already been selected. - unsigned Reg = lookUpRegForValue(LI); + Register Reg = lookUpRegForValue(LI); if (!Reg) return false; @@ -4456,7 +4456,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, } if (IsZExt) { - unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + Register Reg64 = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) @@ -4490,7 +4490,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) { if (optimizeIntExtLoad(I, RetVT, SrcVT)) return true; - unsigned SrcReg = getRegForValue(I->getOperand(0)); + Register SrcReg = getRegForValue(I->getOperand(0)); if (!SrcReg) return false; @@ -4499,7 +4499,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) { if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) { if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) { if (RetVT == MVT::i64 && SrcVT != MVT::i64) { - unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + Register ResultReg = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), ResultReg) .addImm(0) @@ -4543,21 +4543,21 @@ bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { break; } unsigned MSubOpc = Is64bit ? 
AArch64::MSUBXrrr : AArch64::MSUBWrrr; - unsigned Src0Reg = getRegForValue(I->getOperand(0)); + Register Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; - unsigned Src1Reg = getRegForValue(I->getOperand(1)); + Register Src1Reg = getRegForValue(I->getOperand(1)); if (!Src1Reg) return false; const TargetRegisterClass *RC = (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, Src1Reg); + Register QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, Src1Reg); assert(QuotReg && "Unexpected DIV instruction emission failure."); // The remainder is computed as numerator - (quotient * denominator) using the // MSUB instruction. - unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, Src1Reg, Src0Reg); + Register ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, Src1Reg, Src0Reg); updateValueMap(I, ResultReg); return true; } @@ -4602,7 +4602,7 @@ bool AArch64FastISel::selectMul(const Instruction *I) { } } - unsigned Src0Reg = getRegForValue(Src0); + Register Src0Reg = getRegForValue(Src0); if (!Src0Reg) return false; @@ -4615,11 +4615,11 @@ bool AArch64FastISel::selectMul(const Instruction *I) { } } - unsigned Src0Reg = getRegForValue(I->getOperand(0)); + Register Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; - unsigned Src1Reg = getRegForValue(I->getOperand(1)); + Register Src1Reg = getRegForValue(I->getOperand(1)); if (!Src1Reg) return false; @@ -4666,7 +4666,7 @@ bool AArch64FastISel::selectShift(const Instruction *I) { } } - unsigned Op0Reg = getRegForValue(Op0); + Register Op0Reg = getRegForValue(Op0); if (!Op0Reg) return false; @@ -4689,11 +4689,11 @@ bool AArch64FastISel::selectShift(const Instruction *I) { return true; } - unsigned Op0Reg = getRegForValue(I->getOperand(0)); + Register Op0Reg = getRegForValue(I->getOperand(0)); if (!Op0Reg) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); + Register Op1Reg = getRegForValue(I->getOperand(1)); if (!Op1Reg) return false; @@ -4746,11 +4746,11 @@ bool AArch64FastISel::selectBitCast(const Instruction *I) { case MVT::f32: RC = &AArch64::FPR32RegClass; break; case MVT::f64: RC = &AArch64::FPR64RegClass; break; } - unsigned Op0Reg = getRegForValue(I->getOperand(0)); + Register Op0Reg = getRegForValue(I->getOperand(0)); if (!Op0Reg) return false; - unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg); + Register ResultReg = fastEmitInst_r(Opc, RC, Op0Reg); if (!ResultReg) return false; @@ -4810,7 +4810,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { return selectBinaryOp(I, ISD::SDIV); unsigned Lg2 = C.countTrailingZeros(); - unsigned Src0Reg = getRegForValue(I->getOperand(0)); + Register Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; @@ -4840,7 +4840,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { SelectOpc = AArch64::CSELWr; RC = &AArch64::GPR32RegClass; } - unsigned SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg, + Register SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg, AArch64CC::LT); if (!SelectReg) return false; @@ -4866,7 +4866,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) { /// have to duplicate it for AArch64, because otherwise we would fail during the /// sign-extend emission. unsigned AArch64FastISel::getRegForGEPIndex(const Value *Idx) { - unsigned IdxN = getRegForValue(Idx); + Register IdxN = getRegForValue(Idx); if (IdxN == 0) // Unhandled operand. Halt "fast" selection and bail. 
return 0; @@ -4889,7 +4889,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { if (Subtarget->isTargetILP32()) return false; - unsigned N = getRegForValue(I->getOperand(0)); + Register N = getRegForValue(I->getOperand(0)); if (!N) return false; @@ -4983,16 +4983,16 @@ bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) { const MCInstrDesc &II = TII.get(Opc); - const unsigned AddrReg = constrainOperandRegClass( + const Register AddrReg = constrainOperandRegClass( II, getRegForValue(I->getPointerOperand()), II.getNumDefs()); - const unsigned DesiredReg = constrainOperandRegClass( + const Register DesiredReg = constrainOperandRegClass( II, getRegForValue(I->getCompareOperand()), II.getNumDefs() + 1); - const unsigned NewReg = constrainOperandRegClass( + const Register NewReg = constrainOperandRegClass( II, getRegForValue(I->getNewValOperand()), II.getNumDefs() + 2); - const unsigned ResultReg1 = createResultReg(ResRC); - const unsigned ResultReg2 = createResultReg(&AArch64::GPR32RegClass); - const unsigned ScratchReg = createResultReg(&AArch64::GPR32RegClass); + const Register ResultReg1 = createResultReg(ResRC); + const Register ResultReg2 = createResultReg(&AArch64::GPR32RegClass); + const Register ScratchReg = createResultReg(&AArch64::GPR32RegClass); // FIXME: MachineMemOperand doesn't support cmpxchg yet. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 638e45b30d99..a4d20735e2b1 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -547,7 +547,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( return; for (const auto &Info : CSI) { - unsigned Reg = Info.getReg(); + Register Reg = Info.getReg(); // Not all unwinders may know about SVE registers, so assume the lowest // common demoninator. @@ -1653,8 +1653,7 @@ static void InsertReturnAddressAuth(MachineFunction &MF, // The AUTIASP instruction assembles to a hint instruction before v8.3a so // this instruction can safely used for any v8a architecture. // From v8.3a onwards there are optimised authenticate LR and return - // instructions, namely RETA{A,B}, that can be used instead. In this case the - // DW_CFA_AARCH64_negate_ra_state can't be emitted. + // instructions, namely RETA{A,B}, that can be used instead. if (Subtarget.hasPAuth() && MBBI != MBB.end() && MBBI->getOpcode() == AArch64::RET_ReallyLR) { BuildMI(MBB, MBBI, DL, @@ -1666,12 +1665,6 @@ static void InsertReturnAddressAuth(MachineFunction &MF, MBB, MBBI, DL, TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP)) .setMIFlag(MachineInstr::FrameDestroy); - - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameDestroy); } } @@ -2292,7 +2285,7 @@ static void computeCalleeSaveRegisterPairs( // MachO's compact unwind format relies on all registers being stored in // pairs. assert((!produceCompactUnwindFrame(MF) || - CC == CallingConv::PreserveMost || + CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); int ByteOffset = AFI->getCalleeSavedStackSize(); @@ -2331,7 +2324,7 @@ static void computeCalleeSaveRegisterPairs( // Add the next reg to the pair if it is in the same register class. 
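    // (Pairing exists because AArch64 prologues spill callee-saves with
    // stp/ldp, which need two same-class registers in adjacent stack slots;
    // MachO compact unwind additionally insists on the canonical pairs the
    // asserts above spell out.)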
if (unsigned(i + RegInc) < Count) { - unsigned NextReg = CSI[i + RegInc].getReg(); + Register NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; switch (RPI.Type) { case RegPairInfo::GPR: @@ -2387,7 +2380,7 @@ static void computeCalleeSaveRegisterPairs( // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. assert((!produceCompactUnwindFrame(MF) || - CC == CallingConv::PreserveMost || + CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS || (RPI.isPaired() && ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || RPI.Reg1 + 1 == RPI.Reg2))) && @@ -3135,7 +3128,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( DebugLoc DL; RS->enterBasicBlockEnd(MBB); RS->backward(std::prev(MBBI)); - unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass); + Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass); assert(DstReg && "There must be a free register after frame setup"); BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2); BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi)) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index e6d997f91b47..31f57cbc49f2 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -26,9 +26,8 @@ public: : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16), true /*StackRealignable*/) {} - void - emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const override; + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index fe9b2f8883b9..899f069abdd4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5147,5 +5147,5 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) { const AArch64TargetLowering *TLI = static_cast<const AArch64TargetLowering *>(getTargetLowering()); - return TLI->isAllActivePredicate(N); + return TLI->isAllActivePredicate(*CurDAG, N); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e141179fb5c8..a26bbc77f248 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -962,6 +962,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(Align(4)); // Set preferred alignments. 
setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); + setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment()); setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); // Only change the limit for entries in a jump table if specified by @@ -1205,6 +1206,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::ABDS, VT, Custom); + setOperationAction(ISD::ABDU, VT, Custom); setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); @@ -1245,6 +1248,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); // There are no legal MVT::nxv16f## based types. if (VT != MVT::nxv16i1) { @@ -1831,6 +1835,28 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known = KnownBits::commonBits(Known, Known2); break; } + case AArch64ISD::BICi: { + // Compute the bit cleared value. + uint64_t Mask = + ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2)); + Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); + Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask)); + break; + } + case AArch64ISD::VLSHR: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); + Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); + Known = KnownBits::lshr(Known, Known2); + break; + } + case AArch64ISD::VASHR: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); + Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); + Known = KnownBits::ashr(Known, Known2); + break; + } case AArch64ISD::LOADgot: case AArch64ISD::ADDlow: { if (!Subtarget->isTargetILP32()) @@ -1971,6 +1997,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::CSINC) MAKE_CASE(AArch64ISD::THREAD_POINTER) MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::ABDS_PRED) + MAKE_CASE(AArch64ISD::ABDU_PRED) MAKE_CASE(AArch64ISD::ADD_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::MULHS_PRED) @@ -2173,6 +2201,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::INSR) MAKE_CASE(AArch64ISD::PTEST) MAKE_CASE(AArch64ISD::PTRUE) + MAKE_CASE(AArch64ISD::PFALSE) MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) @@ -5173,6 +5202,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFixedLengthVectorSelectToSVE(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::ABDS: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED); + case ISD::ABDU: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED); case ISD::BITREVERSE: return LowerBitreverse(Op, DAG); case ISD::BSWAP: @@ -5380,7 +5413,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. 
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted @@ -5542,7 +5575,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Conservatively forward X8, since it might be used for aggregate return. if (!CCInfo.isAllocated(AArch64::X8)) { - unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); + Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); } } @@ -5626,7 +5659,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); + Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, @@ -5656,7 +5689,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); + Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, @@ -7256,6 +7289,9 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, return getSVESafeBitCast(VT, IntResult, DAG); } + if (!Subtarget->hasNEON()) + return SDValue(); + if (SrcVT.bitsLT(VT)) In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); else if (SrcVT.bitsGT(VT)) @@ -7795,10 +7831,37 @@ SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const { EVT Ty = Op.getValueType(); auto Idx = Op.getConstantOperandAPInt(2); + int64_t IdxVal = Idx.getSExtValue(); + assert(Ty.isScalableVector() && + "Only expect scalable vectors for custom lowering of VECTOR_SPLICE"); + + // We can use the splice instruction for certain index values where we are + // able to efficiently generate the correct predicate. The index will be + // inverted and used directly as the input to the ptrue instruction, i.e. + // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the + // splice predicate. However, we can only do this if we can guarantee that + // there are enough elements in the vector, hence we check the index <= min + // number of elements. + Optional<unsigned> PredPattern; + if (Ty.isScalableVector() && IdxVal < 0 && + (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) != + None) { + SDLoc DL(Op); + + // Create a predicate where all but the last -IdxVal elements are false. + EVT PredVT = Ty.changeVectorElementType(MVT::i1); + SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern); + Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred); + + // Now splice the two inputs together using the predicate. + return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0), + Op.getOperand(1)); + } // This will select to an EXT instruction, which has a maximum immediate // value of 255, hence 2048-bits is the maximum value we can lower. 
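The vl-pattern route described above can be pictured with ACLE intrinsics. A hedged sketch (assumes <arm_sve.h> and an SVE target; it mirrors the node sequence the lowering builds rather than reproducing it) for vector_splice(a, b, -2) on 32-bit lanes, i.e. the last two elements of a followed by leading elements of b:

#include <arm_sve.h>
svint32_t splice_last_two(svint32_t a, svint32_t b) {
  svbool_t p = svptrue_pat_b32(SV_VL2); // first two lanes active: "vl2"
  p = svrev_b32(p);                     // reversed: last two lanes active
  return svsplice_s32(p, a, b);         // SPLICE: tail of a, then b
}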
- if (Idx.sge(-1) && Idx.slt(2048 / Ty.getVectorElementType().getSizeInBits())) + if (IdxVal >= 0 && + IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits())) return Op; return SDValue(); @@ -8227,7 +8290,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, } else { // Return LR, which contains the return address. Mark it an implicit // live-in. - unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); + Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } @@ -9631,14 +9694,12 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, MVT CastVT; if (getScaledOffsetDup(V, Lane, CastVT)) { V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); - } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && + V.getOperand(0).getValueType().is128BitVector()) { // The lane is incremented by the index of the extract. // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 - auto VecVT = V.getOperand(0).getValueType(); - if (VecVT.isFixedLengthVector() && VecVT.getFixedSizeInBits() <= 128) { - Lane += V.getConstantOperandVal(1); - V = V.getOperand(0); - } + Lane += V.getConstantOperandVal(1); + V = V.getOperand(0); } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { // The lane is decremented if we are splatting from the 2nd operand. // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 @@ -9925,7 +9986,7 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, // lowering code. if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { if (ConstVal->isZero()) - return SDValue(DAG.getMachineNode(AArch64::PFALSE, dl, VT), 0); + return DAG.getNode(AArch64ISD::PFALSE, dl, VT); if (ConstVal->isOne()) return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); } @@ -10978,6 +11039,28 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, if (!isTypeLegal(VT)) return SDValue(); + // Break down insert_subvector into simpler parts. + if (VT.getVectorElementType() == MVT::i1) { + unsigned NumElts = VT.getVectorMinNumElements(); + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0, + DAG.getVectorIdxConstant(0, DL)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0, + DAG.getVectorIdxConstant(NumElts / 2, DL)); + if (Idx < (NumElts / 2)) { + SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1, + DAG.getVectorIdxConstant(Idx, DL)); + return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi); + } else { + SDValue NewHi = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1, + DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL)); + return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi); + } + } + // Ensure the subvector is half the size of the main vector. 
if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); @@ -11012,10 +11095,10 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, if (Vec0.isUndef()) return Op; - unsigned int PredPattern = + Optional<unsigned> PredPattern = getSVEPredPatternFromNumElements(InVT.getVectorNumElements()); auto PredTy = VT.changeVectorElementType(MVT::i1); - SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern); + SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern); SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1); return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0); } @@ -11730,10 +11813,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_ldxr: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -11741,10 +11824,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_stxr: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -11772,7 +11855,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; return true; } @@ -11782,7 +11865,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; return true; } @@ -12320,7 +12403,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( Value *PTrue = nullptr; if (UseScalable) { - unsigned PgPattern = + Optional<unsigned> PgPattern = getSVEPredPatternFromNumElements(FVTy->getNumElements()); if (Subtarget->getMinSVEVectorSizeInBits() == Subtarget->getMaxSVEVectorSizeInBits() && @@ -12328,7 +12411,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( PgPattern = AArch64SVEPredPattern::all; auto *PTruePat = - ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), PgPattern); + ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern); PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {PTruePat}); } @@ -12500,7 +12583,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Value *PTrue = nullptr; if (UseScalable) { - unsigned PgPattern = + Optional<unsigned> PgPattern = 
getSVEPredPatternFromNumElements(SubVecTy->getNumElements()); if (Subtarget->getMinSVEVectorSizeInBits() == Subtarget->getMaxSVEVectorSizeInBits() && @@ -12509,7 +12592,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, PgPattern = AArch64SVEPredPattern::all; auto *PTruePat = - ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), PgPattern); + ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern); PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {PTruePat}); } @@ -12901,7 +12984,7 @@ bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; - return (Index == 0 || Index == ResVT.getVectorNumElements()); + return (Index == 0 || Index == ResVT.getVectorMinNumElements()); } /// Turn vector tests of the signbit in the form of: @@ -14261,6 +14344,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDLoc DL(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); uint64_t IdxVal = N->getConstantOperandVal(2); @@ -14286,7 +14370,6 @@ performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // Fold insert_subvector -> concat_vectors // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) - SDLoc DL(N); SDValue Lo, Hi; if (IdxVal == 0) { Lo = SubVec; @@ -15004,7 +15087,15 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, Zero); } -static bool isAllActivePredicate(SDValue N) { +static bool isAllInactivePredicate(SDValue N) { + // Look through cast. + while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) + N = N.getOperand(0); + + return N.getOpcode() == AArch64ISD::PFALSE; +} + +static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { unsigned NumElts = N.getValueType().getVectorMinNumElements(); // Look through cast. @@ -15023,6 +15114,21 @@ static bool isAllActivePredicate(SDValue N) { N.getConstantOperandVal(0) == AArch64SVEPredPattern::all) return N.getValueType().getVectorMinNumElements() >= NumElts; + // If we're compiling for a specific vector-length, we can check if the + // pattern's VL equals that of the scalable vector at runtime. + if (N.getOpcode() == AArch64ISD::PTRUE) { + const auto &Subtarget = + static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); + unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); + unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); + if (MaxSVESize && MinSVESize == MaxSVESize) { + unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock; + unsigned PatNumElts = + getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0)); + return PatNumElts == (NumElts * VScale); + } + } + return false; } @@ -15039,7 +15145,7 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3); // ISD way to specify an all active predicate. - if (isAllActivePredicate(Pg)) { + if (isAllActivePredicate(DAG, Pg)) { if (UnpredOp) return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2); @@ -15870,7 +15976,7 @@ static SDValue performPostLD1Combine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - if (VT.isScalableVector()) + if (!VT.is128BitVector() && !VT.is64BitVector()) return SDValue(); unsigned LoadIdx = IsLaneOp ? 
1 : 0; @@ -16710,6 +16816,12 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); + if (isAllActivePredicate(DAG, N0)) + return N->getOperand(1); + + if (isAllInactivePredicate(N0)) + return N->getOperand(2); + // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform // into (OR (ASR lhs, N-1), 1), which requires less instructions for the // supported types. @@ -18753,7 +18865,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Expected legal fixed length vector!"); - unsigned PgPattern = + Optional<unsigned> PgPattern = getSVEPredPatternFromNumElements(VT.getVectorNumElements()); assert(PgPattern && "Unexpected element count for SVE predicate"); @@ -18789,7 +18901,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, break; } - return getPTrue(DAG, DL, MaskVT, PgPattern); + return getPTrue(DAG, DL, MaskVT, *PgPattern); } static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, @@ -19281,7 +19393,12 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, default: return SDValue(); case ISD::VECREDUCE_OR: - return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE); + if (isAllActivePredicate(DAG, Pg)) + // The predicate can be 'Op' because + // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op). + return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE); + else + return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE); case ISD::VECREDUCE_AND: { Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg); return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE); @@ -19725,8 +19842,9 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, return Op; } -bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const { - return ::isAllActivePredicate(N); +bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG, + SDValue N) const { + return ::isAllActivePredicate(DAG, N); } EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const { @@ -19777,7 +19895,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } -bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal( +bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal( unsigned Opc, LLT Ty1, LLT Ty2) const { return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 367ba3039a0c..ca6c70297c0b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -77,14 +77,16 @@ enum NodeType : unsigned { SBC, // adc, sbc instructions // Predicated instructions where inactive lanes produce undefined results. 
+ ABDS_PRED, + ABDU_PRED, ADD_PRED, FADD_PRED, FDIV_PRED, FMA_PRED, - FMAXNM_PRED, - FMINNM_PRED, FMAX_PRED, + FMAXNM_PRED, FMIN_PRED, + FMINNM_PRED, FMUL_PRED, FSUB_PRED, MUL_PRED, @@ -321,6 +323,7 @@ enum NodeType : unsigned { INSR, PTEST, PTRUE, + PFALSE, BITREVERSE_MERGE_PASSTHRU, BSWAP_MERGE_PASSTHRU, @@ -487,7 +490,6 @@ const unsigned RoundingBitsPos = 22; } // namespace AArch64 class AArch64Subtarget; -class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { public: @@ -842,7 +844,7 @@ public: return 128; } - bool isAllActivePredicate(SDValue N) const; + bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const; EVT getPromotedVTForPredicate(EVT VT) const; EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, @@ -1137,8 +1139,8 @@ private: // with BITCAST used otherwise. SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; - bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1, - LLT Ty2) const override; + bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, + LLT Ty2) const override; }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index 84573dac7e41..b220929514f9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -102,6 +102,34 @@ def : Pat<(relaxed_load<atomic_load_64> (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (LDURXi GPR64sp:$Rn, simm9:$offset)>; +// FP 32-bit loads +def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend))))), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend))))), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn, + uimm12s8:$offset))))), + (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// FP 64-bit loads +def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + //===---------------------------------- // Atomic stores //===---------------------------------- @@ -196,6 +224,38 @@ def : Pat<(relaxed_store<atomic_store_64> (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; +// FP 32-bit stores +def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + (i32 (bitconvert (f32 FPR32Op:$val)))), + (STRSroW FPR32Op:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + 
(i32 (bitconvert (f32 FPR32Op:$val)))), + (STRSroX FPR32Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store<atomic_store_32> + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), (i32 (bitconvert (f32 FPR32Op:$val)))), + (STRSui FPR32Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store<atomic_store_32> + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), (i32 (bitconvert (f32 FPR32Op:$val)))), + (STURSi FPR32Op:$val, GPR64sp:$Rn, simm9:$offset)>; + +// FP 64-bit stores +def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend), + (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDroW FPR64Op:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend), + (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_store<atomic_store_64> + (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store<atomic_store_64> + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), + (STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>; + //===---------------------------------- // Low-level exclusive operations //===---------------------------------- diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f8d492188744..4c1e41b7efee 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1816,10 +1816,10 @@ def am_brcond : Operand<OtherVT> { let OperandType = "OPERAND_PCREL"; } -class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), - "b", ".$cond\t$target", "", - [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, - Sched<[WriteBr]> { +class BranchCond<bit bit4, string mnemonic> + : I<(outs), (ins ccode:$cond, am_brcond:$target), + mnemonic, ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, Sched<[WriteBr]> { let isBranch = 1; let isTerminator = 1; let Uses = [NZCV]; @@ -1828,7 +1828,7 @@ class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), bits<19> target; let Inst{31-24} = 0b01010100; let Inst{23-5} = target; - let Inst{4} = 0; + let Inst{4} = bit4; let Inst{3-0} = cond; } @@ -7700,10 +7700,10 @@ multiclass SIMDTableLookupTied<bit op, string asm> { //---------------------------------------------------------------------------- -// AdvSIMD scalar CPY +// AdvSIMD scalar DUP //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype, +class BaseSIMDScalarDUP<RegisterClass regtype, RegisterOperand vectype, string asm, string kind, Operand idxtype> : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), asm, "{\t$dst, $src" # kind # "$idx" # @@ -7717,30 +7717,30 @@ class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype, let Inst{4-0} = dst; } -class SIMDScalarCPYAlias<string asm, string size, Instruction inst, +class SIMDScalarDUPAlias<string asm, string size, Instruction inst, RegisterClass regtype, RegisterOperand vectype, Operand idxtype> : InstAlias<asm # "{\t$dst, $src" # size # "$index" # "|\t$dst, $src$index}", (inst regtype:$dst, vectype:$src, idxtype:$index), 0>; -multiclass SIMDScalarCPY<string 
asm> { - def i8 : BaseSIMDScalarCPY<FPR8, V128, asm, ".b", VectorIndexB> { +multiclass SIMDScalarDUP<string asm> { + def i8 : BaseSIMDScalarDUP<FPR8, V128, asm, ".b", VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } - def i16 : BaseSIMDScalarCPY<FPR16, V128, asm, ".h", VectorIndexH> { + def i16 : BaseSIMDScalarDUP<FPR16, V128, asm, ".h", VectorIndexH> { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } - def i32 : BaseSIMDScalarCPY<FPR32, V128, asm, ".s", VectorIndexS> { + def i32 : BaseSIMDScalarDUP<FPR32, V128, asm, ".s", VectorIndexS> { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } - def i64 : BaseSIMDScalarCPY<FPR64, V128, asm, ".d", VectorIndexD> { + def i64 : BaseSIMDScalarDUP<FPR64, V128, asm, ".d", VectorIndexD> { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; @@ -7751,16 +7751,16 @@ multiclass SIMDScalarCPY<string asm> { (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>; // 'DUP' mnemonic aliases. - def : SIMDScalarCPYAlias<"dup", ".b", + def : SIMDScalarDUPAlias<"dup", ".b", !cast<Instruction>(NAME#"i8"), FPR8, V128, VectorIndexB>; - def : SIMDScalarCPYAlias<"dup", ".h", + def : SIMDScalarDUPAlias<"dup", ".h", !cast<Instruction>(NAME#"i16"), FPR16, V128, VectorIndexH>; - def : SIMDScalarCPYAlias<"dup", ".s", + def : SIMDScalarDUPAlias<"dup", ".s", !cast<Instruction>(NAME#"i32"), FPR32, V128, VectorIndexS>; - def : SIMDScalarCPYAlias<"dup", ".d", + def : SIMDScalarDUPAlias<"dup", ".d", !cast<Instruction>(NAME#"i64"), FPR64, V128, VectorIndexD>; } @@ -10556,40 +10556,30 @@ class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, pattern> { } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm, - SDPatternOperator Accum> { + SDPatternOperator op> { def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), - (Accum (v4i16 V64:$Rd), - (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), - (v4i16 V64:$Rm)))))]>; + (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), - (Accum (v8i16 V128:$Rd), - (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), - (v8i16 V128:$Rm)))))]>; + (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), - (Accum (v2i32 V64:$Rd), - (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), - (v2i32 V64:$Rm)))))]>; + (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), - (v4i32 V128:$Rm)))))]>; + (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, - SDPatternOperator Accum> { + SDPatternOperator op> { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$dst), - (Accum (v4i16 V64:$Rd), - (v4i16 (int_aarch64_neon_sqrdmulh - (v4i16 V64:$Rn), - (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -10600,11 +10590,9 
@@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8i16 V128:$dst), - (Accum (v8i16 V128:$Rd), - (v8i16 (int_aarch64_neon_sqrdmulh - (v8i16 V128:$Rn), - (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -10615,75 +10603,26 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$dst), - (Accum (v2i32 V64:$Rd), - (v2i32 (int_aarch64_neon_sqrdmulh - (v2i32 V64:$Rn), - (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but - // an intermediate EXTRACT_SUBREG would be untyped. - // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we - // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) - def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract - (v4i32 (insert_subvector - (undef), - (v2i32 (int_aarch64_neon_sqrdmulh - (v2i32 V64:$Rn), - (v2i32 (AArch64duplane32 - (v4i32 V128:$Rm), - VectorIndexS:$idx)))), - (i64 0))), - (i64 0))))), - (EXTRACT_SUBREG - (v2i32 (!cast<Instruction>(NAME # v2i32_indexed) - (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - FPR32Op:$Rd, - ssub)), - V64:$Rn, - V128:$Rm, - VectorIndexS:$idx)), - ssub)>; - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqrdmulh - (v4i32 V128:$Rn), - (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but - // an intermediate EXTRACT_SUBREG would be untyped. 
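The switch in these multiclasses from matching sqadd(sqrdmulh(x, y)) to dedicated sqrdmlah/sqrdmlsh intrinsics is more than cosmetic: SQRDMLAH accumulates the rounded doubling product before the final saturation, so saturating the multiply separately is not bit-identical in every case (stated here as the usual rationale for fused matching, not a claim taken from the patch itself). An ACLE-level sketch of the fused form (assumes <arm_neon.h> and the Armv8.1 RDM extension):

#include <arm_neon.h>
int16x8_t fused_rdm(int16x8_t acc, int16x8_t a, int16x8_t b) {
  // Single SQRDMLAH.8h; no intermediate saturation of the product, as
  // vqaddq_s16(acc, vqrdmulhq_s16(a, b)) would imply.
  return vqrdmlahq_s16(acc, a, b);
}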
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract - (v4i32 (int_aarch64_neon_sqrdmulh - (v4i32 V128:$Rn), - (v4i32 (AArch64duplane32 - (v4i32 V128:$Rm), - VectorIndexS:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (v4i32 (!cast<Instruction>(NAME # v4i32_indexed) - (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - FPR32Op:$Rd, - ssub)), - V128:$Rn, - V128:$Rm, - VectorIndexS:$idx)), - ssub)>; - def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", @@ -10698,11 +10637,9 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (i32 FPR32Op:$dst), - (Accum (i32 FPR32Op:$Rd), - (i32 (int_aarch64_neon_sqrdmulh - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (i32 (op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -11430,6 +11367,123 @@ class Store64BV<bits<3> opc, string asm_inst, list<dag> pat = []> let Inst{20-16} = Rs; } +class MOPSMemoryCopyMoveBase<bit isMove, bits<2> opcode, bits<2> op1, + bits<2> op2, string asm> + : I<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn), + asm, "\t[$Rd]!, [$Rs]!, $Rn!", + "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb", []>, + Sched<[]> { + bits<5> Rd; + bits<5> Rs; + bits<5> Rn; + let Inst{31-27} = 0b00011; + let Inst{26} = isMove; + let Inst{25-24} = 0b01; + let Inst{23-22} = opcode; + let Inst{21} = 0b0; + let Inst{20-16} = Rs; + let Inst{15-14} = op2; + let Inst{13-12} = op1; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeCPYMemOpInstruction"; + let mayLoad = 1; + let mayStore = 1; +} + +class MOPSMemoryCopy<bits<2> opcode, bits<2> op1, bits<2> op2, string asm> + : MOPSMemoryCopyMoveBase<0, opcode, op1, op2, asm>; + +class MOPSMemoryMove<bits<2> opcode, bits<2> op1, bits<2> op2, string asm> + : MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>; + +class MOPSMemorySetBase<bit isTagging, bits<2> opcode, bit op1, bit op2, + string asm> + : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + asm, "\t[$Rd]!, $Rn!, $Rm", + "$Rd = $Rd_wb,$Rn = $Rn_wb", []>, + Sched<[]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-27} = 0b00011; + let Inst{26} = isTagging; + let Inst{25-21} = 0b01110; + let Inst{20-16} = Rm; + let Inst{15-14} = opcode; + let Inst{13} = op2; + let Inst{12} = op1; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeSETMemOpInstruction"; + let mayLoad = 0; + let mayStore = 1; +} + +class MOPSMemorySet<bits<2> opcode, bit op1, bit op2, string asm> + : MOPSMemorySetBase<0, opcode, op1, op2, asm>; + +class MOPSMemorySetTagging<bits<2> opcode, bit op1, bit op2, string asm> + : MOPSMemorySetBase<1, opcode, op1, op2, asm>; + +multiclass MOPSMemoryCopyInsns<bits<2> opcode, string asm> { + def "" : MOPSMemoryCopy<opcode, 0b00, 0b00, asm>; + def WN : MOPSMemoryCopy<opcode, 0b00, 0b01, asm # "wn">; + def RN : MOPSMemoryCopy<opcode, 0b00, 0b10, asm # "rn">; + def N : MOPSMemoryCopy<opcode, 0b00, 0b11, asm # "n">; + def WT : MOPSMemoryCopy<opcode, 0b01, 0b00, asm # "wt">; + def WTWN : MOPSMemoryCopy<opcode, 0b01, 0b01, asm # "wtwn">; + def WTRN : MOPSMemoryCopy<opcode, 0b01, 0b10, asm # "wtrn">; + def WTN : 
MOPSMemoryCopy<opcode, 0b01, 0b11, asm # "wtn">; + def RT : MOPSMemoryCopy<opcode, 0b10, 0b00, asm # "rt">; + def RTWN : MOPSMemoryCopy<opcode, 0b10, 0b01, asm # "rtwn">; + def RTRN : MOPSMemoryCopy<opcode, 0b10, 0b10, asm # "rtrn">; + def RTN : MOPSMemoryCopy<opcode, 0b10, 0b11, asm # "rtn">; + def T : MOPSMemoryCopy<opcode, 0b11, 0b00, asm # "t">; + def TWN : MOPSMemoryCopy<opcode, 0b11, 0b01, asm # "twn">; + def TRN : MOPSMemoryCopy<opcode, 0b11, 0b10, asm # "trn">; + def TN : MOPSMemoryCopy<opcode, 0b11, 0b11, asm # "tn">; +} + +multiclass MOPSMemoryMoveInsns<bits<2> opcode, string asm> { + def "" : MOPSMemoryMove<opcode, 0b00, 0b00, asm>; + def WN : MOPSMemoryMove<opcode, 0b00, 0b01, asm # "wn">; + def RN : MOPSMemoryMove<opcode, 0b00, 0b10, asm # "rn">; + def N : MOPSMemoryMove<opcode, 0b00, 0b11, asm # "n">; + def WT : MOPSMemoryMove<opcode, 0b01, 0b00, asm # "wt">; + def WTWN : MOPSMemoryMove<opcode, 0b01, 0b01, asm # "wtwn">; + def WTRN : MOPSMemoryMove<opcode, 0b01, 0b10, asm # "wtrn">; + def WTN : MOPSMemoryMove<opcode, 0b01, 0b11, asm # "wtn">; + def RT : MOPSMemoryMove<opcode, 0b10, 0b00, asm # "rt">; + def RTWN : MOPSMemoryMove<opcode, 0b10, 0b01, asm # "rtwn">; + def RTRN : MOPSMemoryMove<opcode, 0b10, 0b10, asm # "rtrn">; + def RTN : MOPSMemoryMove<opcode, 0b10, 0b11, asm # "rtn">; + def T : MOPSMemoryMove<opcode, 0b11, 0b00, asm # "t">; + def TWN : MOPSMemoryMove<opcode, 0b11, 0b01, asm # "twn">; + def TRN : MOPSMemoryMove<opcode, 0b11, 0b10, asm # "trn">; + def TN : MOPSMemoryMove<opcode, 0b11, 0b11, asm # "tn">; +} + +multiclass MOPSMemorySetInsns<bits<2> opcode, string asm> { + def "" : MOPSMemorySet<opcode, 0, 0, asm>; + def T : MOPSMemorySet<opcode, 1, 0, asm # "t">; + def N : MOPSMemorySet<opcode, 0, 1, asm # "n">; + def TN : MOPSMemorySet<opcode, 1, 1, asm # "tn">; +} + +multiclass MOPSMemorySetTaggingInsns<bits<2> opcode, string asm> { + def "" : MOPSMemorySetTagging<opcode, 0, 0, asm>; + def T : MOPSMemorySetTagging<opcode, 1, 0, asm # "t">; + def N : MOPSMemorySetTagging<opcode, 0, 1, asm # "n">; + def TN : MOPSMemorySetTagging<opcode, 1, 1, asm # "tn">; +} + //---------------------------------------------------------------------------- // Allow the size specifier tokens to be upper case, not just lower. def : TokenAlias<".4B", ".4b">; // Add dot product diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5fc5e4e5eb35..93c17133c845 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2574,6 +2574,7 @@ AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, AM.BaseReg = Base->getReg(); AM.Displacement = Offset; AM.ScaledReg = 0; + AM.Scale = 0; return AM; } @@ -7350,8 +7351,7 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, .setMIFlags(MachineInstr::FrameSetup); // If v8.3a features are available we can replace a RET instruction by - // RETAA or RETAB and omit the AUT instructions. In this case the - // DW_CFA_AARCH64_negate_ra_state can't be emitted. + // RETAA or RETAB and omit the AUT instructions if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && MBBAUT->getOpcode() == AArch64::RET) { BuildMI(MBB, MBBAUT, DL, @@ -7364,11 +7364,6 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, TII->get(ShouldSignReturnAddrWithAKey ? 
AArch64::AUTIASP : AArch64::AUTIBSP)) .setMIFlag(MachineInstr::FrameDestroy); - unsigned CFIIndexAuth = - MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); - BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndexAuth) - .setMIFlags(MachineInstr::FrameDestroy); } } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index b2f9e82a7e8b..1054bea40e68 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -26,7 +26,6 @@ namespace llvm { class AArch64Subtarget; -class AArch64TargetMachine; static const MachineMemOperand::Flags MOSuppressPair = MachineMemOperand::MOTargetFlag1; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ebccc07edc7a..c8a697c8b82f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -33,6 +33,8 @@ def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">, AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">; def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">, AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">; +def HasV9_3a : Predicate<"Subtarget->hasV9_3aOps()">, + AssemblerPredicate<(all_of HasV9_3aOps), "armv9.3a">; def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">, AssemblerPredicate<(all_of HasV8_0rOps), "armv8-r">; @@ -198,6 +200,10 @@ def HasBRBE : Predicate<"Subtarget->hasBRBE()">, AssemblerPredicate<(all_of FeatureBRBE), "brbe">; def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">; +def HasHBC : Predicate<"Subtarget->hasHBC()">, + AssemblerPredicate<(all_of FeatureHBC), "hbc">; +def HasMOPS : Predicate<"Subtarget->hasMOPS()">, + AssemblerPredicate<(all_of FeatureMOPS), "mops">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -2362,7 +2368,12 @@ def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), //===----------------------------------------------------------------------===// // Conditional branch (immediate) instruction. //===----------------------------------------------------------------------===// -def Bcc : BranchCond; +def Bcc : BranchCond<0, "b">; + +// Armv8.8-A variant form which hints to the branch predictor that +// this branch is very likely to go the same way nearly all the time +// (even though it is not known at compile time _which_ way that is). +def BCcc : BranchCond<1, "bc">, Requires<[HasHBC]>; //===----------------------------------------------------------------------===// // Compare-and-branch instructions. 
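A hedged source-level picture of the branch shape BC.cond is meant for (whether the compiler actually selects BC.cond here is an assumption; the hunk above only adds the instruction definition behind the HBC predicate):

long sum_nonneg(const long *p, long n) {
  long s = 0;
  for (long i = 0; i < n; ++i)
    if (__builtin_expect(p[i] >= 0, 1)) // almost always goes the same way
      s += p[i];
  return s;
}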
@@ -4500,9 +4511,9 @@ defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", - int_aarch64_neon_sqadd>; + int_aarch64_neon_sqrdmlah>; defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", - int_aarch64_neon_sqsub>; + int_aarch64_neon_sqrdmlsh>; // Extra saturate patterns, other than the intrinsics matches above defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>; @@ -4769,15 +4780,11 @@ defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; let Predicates = [HasRDM] in { defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; - def : Pat<(i32 (int_aarch64_neon_sqadd - (i32 FPR32:$Rd), - (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), + def : Pat<(i32 (int_aarch64_neon_sqrdmlah (i32 FPR32:$Rd), (i32 FPR32:$Rn), + (i32 FPR32:$Rm))), (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(i32 (int_aarch64_neon_sqsub - (i32 FPR32:$Rd), - (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), + def : Pat<(i32 (int_aarch64_neon_sqrdmlsh (i32 FPR32:$Rd), (i32 FPR32:$Rn), + (i32 FPR32:$Rm))), (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; } @@ -5342,19 +5349,6 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))), (v2i32 (trunc (v2i64 V128:$Vm))))), (UZP1v4i32 V128:$Vn, V128:$Vm)>; -def : Pat<(v16i8 (concat_vectors - (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))), - (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))), - (UZP2v16i8 V128:$Vn, V128:$Vm)>; -def : Pat<(v8i16 (concat_vectors - (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))), - (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))), - (UZP2v8i16 V128:$Vn, V128:$Vm)>; -def : Pat<(v4i32 (concat_vectors - (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))), - (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), - (UZP2v4i32 V128:$Vn, V128:$Vm)>; - //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- @@ -5376,10 +5370,10 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), //---------------------------------------------------------------------------- -// AdvSIMD scalar CPY instruction +// AdvSIMD scalar DUP instruction //---------------------------------------------------------------------------- -defm CPY : SIMDScalarCPY<"mov">; +defm DUP : SIMDScalarDUP<"mov">; //---------------------------------------------------------------------------- // AdvSIMD scalar pairwise instructions @@ -5790,7 +5784,7 @@ defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>; // Floating point vector extractions are codegen'd as either a sequence of -// subregister extractions, or a MOV (aka CPY here, alias for DUP) if +// subregister extractions, or a MOV (aka DUP here) if // the lane number is anything other than zero. 
def : Pat<(vector_extract (v2f64 V128:$Rn), 0), (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; @@ -5803,13 +5797,13 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), 0), def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), - (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; + (f64 (DUPi64 V128:$Rn, VectorIndexD:$idx))>; def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), - (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; + (f32 (DUPi32 V128:$Rn, VectorIndexS:$idx))>; def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), - (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; + (f16 (DUPi16 V128:$Rn, VectorIndexH:$idx))>; def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), - (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; + (bf16 (DUPi16 V128:$Rn, VectorIndexH:$idx))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. In the general case we need an instruction, which had just as well be @@ -6407,9 +6401,9 @@ defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", int_aarch64_neon_sqsub>; defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah", - int_aarch64_neon_sqadd>; + int_aarch64_neon_sqrdmlah>; defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", - int_aarch64_neon_sqsub>; + int_aarch64_neon_sqrdmlsh>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; @@ -6425,6 +6419,22 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), VectorIndexS:$idx)), (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; +// Match add node and also treat an 'or' node as an 'add' if the or'ed operands +// have no common bits. +def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), + [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{ + if (N->getOpcode() == ISD::ADD) + return true; + return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)); +}]> { + let GISelPredicateCode = [{ + // Only handle G_ADD for now. FIXME: build capability to compute whether + // operands of G_OR have common bits set or not.
+ return MI.getOpcode() == TargetOpcode::G_ADD; + }]; +} + + //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- @@ -6530,7 +6540,7 @@ defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", (AArch64srshri node:$MHS, node:$RHS))>>; defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; @@ -6543,7 +6553,7 @@ defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", (AArch64urshri node:$MHS, node:$RHS))>>; defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))>>; //---------------------------------------------------------------------------- @@ -6585,7 +6595,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", @@ -6601,7 +6611,7 @@ defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>; defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; // RADDHN patterns for when RSHRN shifts by half the size of the vector element def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))), @@ -8106,7 +8116,7 @@ class NTStore128Pat<ValueType VT> : Pat<(nontemporalstore (VT FPR128:$Rt), (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), - (CPYi64 FPR128:$Rt, (i64 1)), + (DUPi64 FPR128:$Rt, (i64 1)), GPR64sp:$Rn, simm7s8:$offset)>; def : NTStore128Pat<v2i64>; @@ -8118,7 +8128,7 @@ class NTStore64Pat<ValueType VT> : Pat<(nontemporalstore (VT FPR64:$Rt), (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), - (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + (DUPi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), GPR64sp:$Rn, simm7s4:$offset)>; // FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? 
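The premise behind add_and_or_is_add is that an OR of operands with no common set bits can never produce a carry, so it equals ADD; a minimal self-contained check:

#include <cassert>
#include <cstdint>
int main() {
  uint32_t hi = 0xAB00u, lo = 0x00CDu; // disjoint bit patterns
  assert((hi | lo) == hi + lo);        // both equal 0xABCD: no carries
  return 0;
}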
@@ -8319,6 +8329,26 @@ let Predicates = [HasLS64] in { def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>; } +let Predicates = [HasMOPS] in { + defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">; + defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">; + defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">; + + defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">; + defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">; + defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">; + + defm SETP : MOPSMemorySetInsns<0b00, "setp">; + defm SETM : MOPSMemorySetInsns<0b01, "setm">; + defm SETE : MOPSMemorySetInsns<0b10, "sete">; +} +let Predicates = [HasMOPS, HasMTE] in { + defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">; + defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">; + // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td + defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">; +} + let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in def StoreSwiftAsyncContext : Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset), diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 3a836ac33064..6aefc1fdb599 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1139,7 +1139,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, ? getLdStOffsetOp(*StoreI).getImm() : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; - unsigned DestReg = + Register DestReg = IsStoreXReg ? Register(TRI->getMatchingSuperReg( LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) : LdRt; diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/llvm/lib/Target/AArch64/AArch64MCInstLower.h index 8f3148a98410..b008e49d52dd 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.h +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.h @@ -14,15 +14,12 @@ namespace llvm { class AsmPrinter; -class MCAsmInfo; class MCContext; class MCInst; class MCOperand; class MCSymbol; class MachineInstr; -class MachineModuleInfoMachO; class MachineOperand; -class Mangler; /// AArch64MCInstLower - This class is used to lower an MachineInstr /// into an MCInst. diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 42db18332f1c..1fc5617b49f6 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -11,12 +11,19 @@ // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri // MOVi64imm + ANDXrr ==> ANDXri + ANDXri // +// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri +// +// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri +// MOVi64imm + SUBXrr ==> SUBXri + SUBXri +// // The mov pseudo instruction could be expanded to multiple mov instructions // later. In this case, we could try to split the constant operand of mov -// instruction into two bitmask immediates. It makes two AND instructions -// intead of multiple `mov` + `and` instructions. +// instruction into two immediates which can be directly encoded into +// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of +// multiple `mov` + `and/add/sub` instructions. // -// 2. Remove redundant ORRWrs which is generated by zero-extend. +// 4. Remove redundant ORRWrs which is generated by zero-extend.
// // %3:gpr32 = ORRWrs $wzr, %2, 0 // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 @@ -30,6 +37,7 @@ #include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -48,11 +56,44 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { } const AArch64InstrInfo *TII; + const AArch64RegisterInfo *TRI; MachineLoopInfo *MLI; MachineRegisterInfo *MRI; template <typename T> - bool visitAND(MachineInstr &MI, + using SplitAndOpcFunc = + std::function<Optional<unsigned>(T, unsigned, T &, T &)>; + using BuildMIFunc = + std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register, + Register, Register)>; + + /// For instructions where an immediate operand could be split into two + /// separate immediate instructions, use splitTwoPartImm to handle the + /// optimization. + /// + /// To implement, the following function types must be passed to + /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if + /// splitting the immediate is valid and returns the associated new opcode. A + /// BuildMIFunc must be implemented to build the two immediate instructions. + /// + /// Example Pattern (where IMM would require 2+ MOV instructions): + /// %dst = <Instr>rr %src IMM [...] + /// becomes: + /// %tmp = <Instr>ri %src (encode half IMM) [...] + /// %dst = <Instr>ri %tmp (encode half IMM) [...] + template <typename T> + bool splitTwoPartImm(MachineInstr &MI, + SmallSetVector<MachineInstr *, 8> &ToBeRemoved, + SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr); + + bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, + MachineInstr *&SubregToRegMI); + + template <typename T> + bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, + SmallSetVector<MachineInstr *, 8> &ToBeRemoved); + template <typename T> + bool visitAND(unsigned Opc, MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved); bool visitORR(MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved); @@ -116,7 +157,8 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { template <typename T> bool AArch64MIPeepholeOpt::visitAND( - MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { + unsigned Opc, MachineInstr &MI, + SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { // Try the below transformation. // // MOVi32imm + ANDWrr ==> ANDWri + ANDWri // MOVi64imm + ANDXrr ==> ANDXri + ANDXri // // The mov pseudo instruction could be expanded to multiple mov instructions // later. In this case, we could try to split the constant operand of mov @@ -127,23 +169,151 @@ bool AArch64MIPeepholeOpt::visitAND( // bitmask immediates. It makes only two AND instructions instead of multiple // mov + and instructions.
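For reference, a standalone C++ sketch of the two-part immediate split that splitTwoPartImm drives for the add/sub case; it mirrors the splitAddSubImm helper added later in this patch but omits the expandMOVImm single-instruction check, and the constants in main are hypothetical:

#include <cassert>
#include <cstdint>

// Split Imm into (Imm0 << 12) + Imm1 so it can feed two ADD/SUB "ri"
// instructions, whose 12-bit immediates may optionally be shifted left by 12.
static bool splitAddSubImm(uint64_t Imm, uint64_t &Imm0, uint64_t &Imm1) {
  // Both 12-bit halves must be non-zero, and no bit above bit 23 may be set.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~uint64_t(0xffffff)) != 0)
    return false;
  Imm0 = (Imm >> 12) & 0xfff; // encoded with LSL #12
  Imm1 = Imm & 0xfff;         // encoded with LSL #0
  return true;
}

int main() {
  uint64_t Imm0, Imm1;
  assert(splitAddSubImm(0x123456, Imm0, Imm1));   // hypothetical constant
  assert(((Imm0 << 12) + Imm1) == 0x123456);      // add #0x123, lsl #12; add #0x456
  assert(!splitAddSubImm(0x1000000, Imm0, Imm1)); // bit 24 set: cannot split
  return 0;
}

visitAND plugs splitBitmaskImm into the same framework to produce two logical (bitmask) immediates instead.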
- unsigned RegSize = sizeof(T) * 8; - assert((RegSize == 32 || RegSize == 64) && - "Invalid RegSize for AND bitmask peephole optimization"); + return splitTwoPartImm<T>( + MI, ToBeRemoved, + [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> { + if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) + return Opc; + return None; + }, + [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + unsigned Imm1, Register SrcReg, Register NewTmpReg, + Register NewDstReg) { + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + .addReg(SrcReg) + .addImm(Imm0); + BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + .addReg(NewTmpReg) + .addImm(Imm1); + }); +} + +bool AArch64MIPeepholeOpt::visitORR( + MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { + // Check this ORR comes from the below zero-extend pattern. + // + // def : Pat<(i64 (zext GPR32:$src)), + // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; + if (MI.getOperand(3).getImm() != 0) + return false; + + if (MI.getOperand(1).getReg() != AArch64::WZR) + return false; + + MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); + if (!SrcMI) + return false; + + // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC + // + // When you use the 32-bit form of an instruction, the upper 32 bits of the + // source registers are ignored and the upper 32 bits of the destination + // register are set to zero. + // + // If AArch64's 32-bit form of instruction defines the source operand of the + // zero-extend, we do not need the zero-extend. Let's check that the MI's + // opcode is a real AArch64 instruction and, if it is not, conservatively do + // not process it. + if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) + return false; + + Register DefReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + MRI->replaceRegWith(DefReg, SrcReg); + MRI->clearKillFlags(SrcReg); + // replaceRegWith changes MI's definition register. Keep it for SSA form until + // deleting MI. + MI.getOperand(0).setReg(DefReg); + ToBeRemoved.insert(&MI); + + LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n"); + + return true; +} + +template <typename T> +static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { + // The immediate must be in the form of ((imm0 << 12) + imm1), in which both + // imm0 and imm1 are non-zero 12-bit unsigned integers. + if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || + (Imm & ~static_cast<T>(0xffffff)) != 0) + return false; + + // The immediate cannot be composed via a single instruction. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return false; + + // Split Imm into (Imm0 << 12) + Imm1; + Imm0 = (Imm >> 12) & 0xfff; + Imm1 = Imm & 0xfff; + return true; +} + +template <typename T> +bool AArch64MIPeepholeOpt::visitADDSUB( + unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, + SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { + // Try the below transformation. + // + // MOVi32imm + ADDWrr ==> ADDWri + ADDWri + // MOVi64imm + ADDXrr ==> ADDXri + ADDXri + // + // MOVi32imm + SUBWrr ==> SUBWri + SUBWri + // MOVi64imm + SUBXrr ==> SUBXri + SUBXri + // + // The mov pseudo instruction could be expanded to multiple mov instructions + // later. Let's try to split the constant operand of the mov instruction into + // two legal add/sub immediates.
It makes only two ADD/SUB instructions instead of + // multiple `mov` + `add/sub` instructions. + + return splitTwoPartImm<T>( + MI, ToBeRemoved, + [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> Optional<unsigned> { + if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) + return PosOpc; + if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) + return NegOpc; + return None; + }, + [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + unsigned Imm1, Register SrcReg, Register NewTmpReg, + Register NewDstReg) { + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + .addReg(SrcReg) + .addImm(Imm0) + .addImm(12); + BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + .addReg(NewTmpReg) + .addImm(Imm1) + .addImm(0); + }); +} - - // Check whether AND's MBB is in loop and the AND is loop invariant. +// Checks if the corresponding MOV immediate instruction is applicable for +// this peephole optimization. +bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, + MachineInstr *&MovMI, + MachineInstr *&SubregToRegMI) { + // Check whether the current MBB is in a loop and the MI is loop invariant. MachineBasicBlock *MBB = MI.getParent(); MachineLoop *L = MLI->getLoopFor(MBB); if (L && !L->isLoopInvariant(MI)) return false; - // Check whether AND's operand is MOV with immediate. - MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); + // Check whether the current MI's operand is a MOV with immediate. + MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); if (!MovMI) return false; - MachineInstr *SubregToRegMI = nullptr; // If it is SUBREG_TO_REG, check its operand. + SubregToRegMI = nullptr; if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { SubregToRegMI = MovMI; MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg()); @@ -159,47 +329,63 @@ bool AArch64MIPeepholeOpt::visitAND( // more instructions. if (!MRI->hasOneUse(MovMI->getOperand(0).getReg())) return false; - if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg())) return false; - // Split the bitmask immediate into two. - T UImm = static_cast<T>(MovMI->getOperand(1).getImm()); + // It is OK to perform this peephole optimization. + return true; +} + +template <typename T> +bool AArch64MIPeepholeOpt::splitTwoPartImm( + MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved, + SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) { + unsigned RegSize = sizeof(T) * 8; + assert((RegSize == 32 || RegSize == 64) && + "Invalid RegSize for legal immediate peephole optimization"); + + // Perform several essential checks against the current MI. + MachineInstr *MovMI, *SubregToRegMI; + if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) + return false; + + // Split the immediate into Imm0 and Imm1, and calculate the Opcode. + T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1; // For the 32 bit form of instruction, the upper 32 bits of the destination // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits - // of UImm to zero. + // of Imm to zero. This is essential if the immediate value was negative, + // since it was sign-extended when assigned to the 64-bit Imm. if (SubregToRegMI) - UImm &= 0xFFFFFFFF; - T Imm1Enc; - T Imm2Enc; - if (!splitBitmaskImm(UImm, RegSize, Imm1Enc, Imm2Enc)) + Imm &= 0xFFFFFFFF; + unsigned Opcode; + if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) + Opcode = R.getValue(); + else return false; - // Create new AND MIs.
- DebugLoc DL = MI.getDebugLoc(); - const TargetRegisterClass *ANDImmRC = - (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass; + // Create the new MIs. + MachineFunction *MF = MI.getMF(); + const TargetRegisterClass *RC = + TII->getRegClass(TII->get(Opcode), 0, TRI, *MF); + const TargetRegisterClass *ORC = + TII->getRegClass(TII->get(Opcode), 1, TRI, *MF); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - Register NewTmpReg = MRI->createVirtualRegister(ANDImmRC); - Register NewDstReg = MRI->createVirtualRegister(ANDImmRC); - unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri; - - MRI->constrainRegClass(NewTmpReg, MRI->getRegClass(SrcReg)); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) - .addReg(SrcReg) - .addImm(Imm1Enc); + Register NewTmpReg = MRI->createVirtualRegister(RC); + Register NewDstReg = MRI->createVirtualRegister(RC); + MRI->constrainRegClass(SrcReg, RC); + MRI->constrainRegClass(NewTmpReg, ORC); MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg)); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) - .addReg(NewTmpReg) - .addImm(Imm2Enc); + + BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); MRI->replaceRegWith(DstReg, NewDstReg); // replaceRegWith changes MI's definition register. Keep it for SSA form until // deleting MI. MI.getOperand(0).setReg(DstReg); + // Record the MIs that need to be removed. ToBeRemoved.insert(&MI); if (SubregToRegMI) ToBeRemoved.insert(SubregToRegMI); @@ -208,59 +394,17 @@ bool AArch64MIPeepholeOpt::visitAND( return true; } -bool AArch64MIPeepholeOpt::visitORR( - MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { - // Check this ORR comes from below zero-extend pattern. - // - // def : Pat<(i64 (zext GPR32:$src)), - // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; - if (MI.getOperand(3).getImm() != 0) - return false; - - if (MI.getOperand(1).getReg() != AArch64::WZR) - return false; - - MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); - if (!SrcMI) - return false; - - // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC - // - // When you use the 32-bit form of an instruction, the upper 32 bits of the - // source registers are ignored and the upper 32 bits of the destination - // register are set to zero. - // - // If AArch64's 32-bit form of instruction defines the source operand of - // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is - // real AArch64 instruction and if it is not, do not process the opcode - // conservatively. - if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) - return false; - - Register DefReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(2).getReg(); - MRI->replaceRegWith(DefReg, SrcReg); - MRI->clearKillFlags(SrcReg); - // replaceRegWith changes MI's definition register. Keep it for SSA form until - // deleting MI.
- MI.getOperand(0).setReg(DefReg); - ToBeRemoved.insert(&MI); - - LLVM_DEBUG({ dbgs() << "Removed: " << MI << "\n"; }); - - return true; -} - bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = static_cast<const AArch64RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); MLI = &getAnalysis<MachineLoopInfo>(); MRI = &MF.getRegInfo(); - if (!MRI->isSSA()) - return false; + assert(MRI->isSSA() && "Expected to be run on SSA form!"); bool Changed = false; SmallSetVector<MachineInstr *, 8> ToBeRemoved; @@ -271,13 +415,30 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { default: break; case AArch64::ANDWrr: - Changed = visitAND<uint32_t>(MI, ToBeRemoved); + Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved); break; case AArch64::ANDXrr: - Changed = visitAND<uint64_t>(MI, ToBeRemoved); + Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved); break; case AArch64::ORRWrs: Changed = visitORR(MI, ToBeRemoved); + break; + case AArch64::ADDWrr: + Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI, + ToBeRemoved); + break; + case AArch64::SUBWrr: + Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI, + ToBeRemoved); + break; + case AArch64::ADDXrr: + Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI, + ToBeRemoved); + break; + case AArch64::SUBXrr: + Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI, + ToBeRemoved); + break; } } } diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 80d98d17e1d6..2ef7bc83003a 100644 --- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -633,7 +633,7 @@ bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { /// Return true when the instruction is processed successfully. 
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { - assert (DefiningMI != NULL); + assert(DefiningMI != nullptr); if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) return false; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index eb55a472a69a..73a680465f6f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -180,20 +180,22 @@ def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>; -def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>; -def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>; def AArch64fmax_p : SDNode<"AArch64ISD::FMAX_PRED", SDT_AArch64Arith>; +def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>; def AArch64fmin_p : SDNode<"AArch64ISD::FMIN_PRED", SDT_AArch64Arith>; +def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>; def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>; def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>; def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>; def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>; def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>; +def AArch64sabd_p : SDNode<"AArch64ISD::ABDS_PRED", SDT_AArch64Arith>; def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>; def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; +def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; @@ -277,8 +279,11 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), return N->hasOneUse(); }]>; +def AArch64fabd_p : PatFrag<(ops node:$pg, node:$op1, node:$op2), + (AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)>; + def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt), - (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ + (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ return N->getFlags().hasNoSignedZeros(); }]>; @@ -415,6 +420,8 @@ let Predicates = [HasSVEorStreamingSVE] in { defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>; defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>; defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>; + defm SABD_ZPZZ : sve_int_bin_pred_bhsd<AArch64sabd_p>; + defm UABD_ZPZZ : sve_int_bin_pred_bhsd<AArch64uabd_p>; defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", AArch64frecpe>; defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", AArch64frsqrte>; @@ -469,6 +476,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>; defm FMAX_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmax_p>; defm FMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmin_p>; + defm FABD_ZPZZ 
: sve_fp_bin_pred_hfd<AArch64fabd_p>; defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>; } // End HasSVEorStreamingSVE @@ -642,11 +650,11 @@ let Predicates = [HasSVEorStreamingSVE] in { (DUP_ZI_D $a, $b)>; // Duplicate immediate FP into all vector elements. - def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))), + def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; - def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))), + def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; - def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))), + def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))), (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>; // Duplicate FP immediate into all vector elements @@ -722,11 +730,11 @@ let Predicates = [HasSVEorStreamingSVE] in { defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; def PTEST_PP : sve_int_ptest<0b010000, "ptest">; - def PFALSE : sve_int_pfalse<0b000000, "pfalse">; + defm PFALSE : sve_int_pfalse<0b000000, "pfalse">; defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; - defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>; + defm AND_PPzPP : sve_int_pred_log_and<0b0000, "and", int_aarch64_sve_and_z>; defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>; defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>; defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>; @@ -1419,6 +1427,16 @@ let Predicates = [HasSVEorStreamingSVE] in { (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF), (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>; + // Splice with lane index greater than or equal to 0 + def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; @@ -2496,6 +2514,7 @@ let Predicates = [HasSVEorStreamingSVE] in { // 16-element contiguous store defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; + // Insert scalar into undef[0] def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)), @@ -2691,17 +2710,6 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; } - - // Splice with lane bigger or equal to 0 - def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; - def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127
i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; - def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; - def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; - } // End HasSVEorStreamingSVE let Predicates = [HasSVE, HasMatMulInt8] in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td index 877c4d2ced41..009219ce3c54 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -235,10 +235,14 @@ def : ReadAdvance<ReadID, 1, [WriteImm,WriteI, //--- // Miscellaneous //--- -def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W")>; -def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS[^W]")>; -def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)")>; -def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ")>; +def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?Wi")>; +def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPSi")>; +def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)i")>; +def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQi")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W(pre|post)")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS(pre|post)")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)(pre|post)")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ(pre|post)")>; def : InstRW<[WriteI], (instrs COPY)>; //--- // Vector Loads - 64-bit per cycle diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index 168a762241ca..a860aa907fd1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -526,7 +526,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>; // ASIMD duplicate, gen reg, D-form and Q-form -def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>; // ASIMD move, saturating diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 1d25a6c00f95..fa10d056b7f7 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -1891,7 +1891,7 @@ def : InstRW<[A64FXWrite_4Cyc_GI0], // ASIMD duplicate, gen reg // ASIMD duplicate, element def : InstRW<[A64FXWrite_DUPGENERAL], (instregex "^DUPv")>; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^CPY")>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUPv.+gpr")>; // ASIMD extract @@ -2512,16 +2512,16 @@ def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTW_XPiI)>; def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs COMPACT_ZPZ_D, COMPACT_ZPZ_S)>; // [72] "cpy $Zd, $Pg/m, $Rn"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmR_B, 
CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>; // [73] "cpy $Zd, $Pg/m, $Vn"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>; // [74] "cpy $Zd, $Pg/m, $imm"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>; // [75] "cpy $Zd, $Pg/z, $imm"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>; // [76] "ctermeq $Rn, $Rm"; def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMEQ_WW, CTERMEQ_XX)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index 14df8236504b..d66efb82fccc 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -669,7 +669,7 @@ def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; def : InstRW<[M3WriteNSHF1], (instregex "^[SU]?Q?XTU?Nv")>; -def : InstRW<[M3WriteNSHF1], (instregex "^CPY")>; +def : InstRW<[M3WriteNSHF1], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[M3WriteNSHF1], (instregex "^INSv.+lane")>; def : InstRW<[M3WriteMOVI], (instregex "^MOVI")>; def : InstRW<[M3WriteNALU1], (instregex "^FMOVv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index 8f740a9a0d35..94e70793e855 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -810,7 +810,7 @@ def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; -def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; +def : InstRW<[M4WriteNSHF1], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[M4WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M4WriteNSHF1], (instregex "^EXTv")>; def : InstRW<[M4WriteNSHT4A], (instregex "^XTNv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 93e1b66bea03..1db5f5322a64 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -848,7 +848,7 @@ def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; -def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; +def : InstRW<[M5WriteNSHF2], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[M5WriteNSHF2], (instregex "^DUPv.+lane")>; def : InstRW<[M5WriteNSHF2], (instregex "^EXTv")>; def : InstRW<[M5WriteNSHT4A], (instregex "^XTNv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td index f2cd83caffa2..a3a038f869fb 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -908,7 +908,7 @@ def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>; // 
----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index e4cae97b5524..ffa0a5e7d91a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1499,7 +1499,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], // ASIMD duplicate, gen reg // ASIMD duplicate, element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>; -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>; // ASIMD extract diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td index 08be2b3a55b3..46a1c217f984 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -1608,7 +1608,7 @@ def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], // ASIMD duplicate, gen reg // ASIMD duplicate, element def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>; -def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^CPY")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>; // ASIMD extract diff --git a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp index 7307961ddb5f..87be7bb6d113 100644 --- a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -304,7 +304,7 @@ bool AArch64SpeculationHardening::instrumentControlFlow( // sure if that would actually result in a big performance difference // though. Maybe RegisterScavenger::findSurvivorBackwards has some logic // already to do this - but it's unclear if that could easily be used here. - unsigned TmpReg = RS.FindUnusedReg(&AArch64::GPR64commonRegClass); + Register TmpReg = RS.FindUnusedReg(&AArch64::GPR64commonRegClass); LLVM_DEBUG(dbgs() << "RS finds " << ((TmpReg == 0) ? 
"no register " : "register "); if (TmpReg != 0) dbgs() << printReg(TmpReg, TRI) << " "; diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp index d2488f61eb4b..cae6d65bed2d 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -195,7 +195,7 @@ void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) { void AArch64StackTaggingPreRA::uncheckLoadsAndStores() { for (auto *I : ReTags) { - unsigned TaggedReg = I->getOperand(0).getReg(); + Register TaggedReg = I->getOperand(0).getReg(); int FI = I->getOperand(1).getIndex(); uncheckUsesOf(TaggedReg, FI); } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index f7d3dd0bc222..a4f4b8582182 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/AArch64TargetParser.h" #include "llvm/Support/TargetParser.h" using namespace llvm; @@ -157,13 +158,19 @@ void AArch64Subtarget::initializeProperties() { break; case NeoverseN1: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; break; case NeoverseN2: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; VScaleForTuning = 1; break; case NeoverseV1: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; VScaleForTuning = 2; break; case Neoverse512TVB: @@ -228,8 +235,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, IsLittle(LittleEndian), MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride), MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT), - FrameLowering(), - InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TSInfo(), + InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TLInfo(TM, *this) { if (AArch64::isX18ReservedByDefault(TT)) ReserveXRegister.set(18); @@ -367,9 +373,4 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { MFI.computeMaxCallFrameSize(MF); } -bool AArch64Subtarget::useSVEForFixedLengthVectors() const { - // Prefer NEON unless larger SVE registers are available. 
- return hasSVE() && getMinSVEVectorSizeInBits() >= 256; -} - bool AArch64Subtarget::useAA() const { return UseAA; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index b3cd5ebd5f65..3e3c0f6aba15 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -94,9 +94,11 @@ protected: bool HasV8_5aOps = false; bool HasV8_6aOps = false; bool HasV8_7aOps = false; + bool HasV8_8aOps = false; bool HasV9_0aOps = false; bool HasV9_1aOps = false; bool HasV9_2aOps = false; + bool HasV9_3aOps = false; bool HasV8_0rOps = false; bool HasCONTEXTIDREL2 = false; @@ -188,6 +190,10 @@ protected: bool HasHCX = false; bool HasLS64 = false; + // Armv8.8-A Extensions + bool HasHBC = false; + bool HasMOPS = false; + // Arm SVE2 extensions bool HasSVE2 = false; bool HasSVE2AES = false; @@ -274,6 +280,7 @@ protected: unsigned MaxPrefetchIterationsAhead = UINT_MAX; unsigned PrefFunctionLogAlignment = 0; unsigned PrefLoopLogAlignment = 0; + unsigned MaxBytesForLoopAlignment = 0; unsigned MaxJumpTableSize = 0; unsigned WideningBaseCost = 0; @@ -365,6 +372,7 @@ public: bool hasV9_0aOps() const { return HasV9_0aOps; } bool hasV9_1aOps() const { return HasV9_1aOps; } bool hasV9_2aOps() const { return HasV9_2aOps; } + bool hasV9_3aOps() const { return HasV9_3aOps; } bool hasV8_0rOps() const { return HasV8_0rOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } @@ -464,6 +472,10 @@ public: } unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } + unsigned getMaxBytesForLoopAlignment() const { + return MaxBytesForLoopAlignment; + } + unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } unsigned getWideningBaseCost() const { return WideningBaseCost; } @@ -572,6 +584,8 @@ public: bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } bool hasEL2VMSA() const { return HasEL2VMSA; } bool hasEL3() const { return HasEL3; } + bool hasHBC() const { return HasHBC; } + bool hasMOPS() const { return HasMOPS; } bool fixCortexA53_835769() const { return FixCortexA53_835769; } @@ -666,7 +680,10 @@ public: return MinSVEVectorSizeInBits; } - bool useSVEForFixedLengthVectors() const; + bool useSVEForFixedLengthVectors() const { + // Prefer NEON unless larger SVE registers are available. 
+ return hasSVE() && getMinSVEVectorSizeInBits() >= 256; + } unsigned getVScaleForTuning() const { return VScaleForTuning; } }; diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index f9fe804865a5..cce5813fe6e9 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -1333,7 +1333,7 @@ def : RWSysReg<"PRBAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b000>; def : RWSysReg<"PRLAR_EL1", 0b11, 0b000, 0b0110, 0b1000, 0b001>; def : RWSysReg<"PRLAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b001>; -foreach n = 0-15 in { +foreach n = 1-15 in { foreach x = 1-2 in { //Direct access to the Protection Region Base Address Register for the nth MPU region def : RWSysReg<!strconcat("PRBAR"#n, "_EL"#x), @@ -1348,7 +1348,7 @@ foreach x = 1-2 in { let Encoding{13} = !add(x,-1); } } //foreach x = 1-2 in -} //foreach n = 0-15 in +} //foreach n = 1-15 in } //let Requires = [{ {AArch64::HasV8_0rOps} }] in // v8.1a "Privileged Access Never" extension-specific system registers diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 25e626134317..7d314bce99b1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -20,8 +20,6 @@ namespace llvm { -class AArch64RegisterBankInfo; - class AArch64TargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index dfc66f0cb4c1..7ed934cfabc0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -25,8 +25,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, SupportDebugThreadLocalLocation = false; } -AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile() - : TargetLoweringObjectFileMachO() { +AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile() { SupportGOTPCRelWithOffset = false; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h index 28324c2ae608..9f098230bbd7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -13,7 +13,6 @@ #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { -class AArch64TargetMachine; /// This implementation is used for AArch64 ELF targets (Linux in particular).
class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d21854e38f5a..a4d666a0a3c2 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -331,6 +331,45 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + static const CostTblEntry WithOverflowCostTbl[] = { + {Intrinsic::sadd_with_overflow, MVT::i8, 3}, + {Intrinsic::uadd_with_overflow, MVT::i8, 3}, + {Intrinsic::sadd_with_overflow, MVT::i16, 3}, + {Intrinsic::uadd_with_overflow, MVT::i16, 3}, + {Intrinsic::sadd_with_overflow, MVT::i32, 1}, + {Intrinsic::uadd_with_overflow, MVT::i32, 1}, + {Intrinsic::sadd_with_overflow, MVT::i64, 1}, + {Intrinsic::uadd_with_overflow, MVT::i64, 1}, + {Intrinsic::ssub_with_overflow, MVT::i8, 3}, + {Intrinsic::usub_with_overflow, MVT::i8, 3}, + {Intrinsic::ssub_with_overflow, MVT::i16, 3}, + {Intrinsic::usub_with_overflow, MVT::i16, 3}, + {Intrinsic::ssub_with_overflow, MVT::i32, 1}, + {Intrinsic::usub_with_overflow, MVT::i32, 1}, + {Intrinsic::ssub_with_overflow, MVT::i64, 1}, + {Intrinsic::usub_with_overflow, MVT::i64, 1}, + {Intrinsic::smul_with_overflow, MVT::i8, 5}, + {Intrinsic::umul_with_overflow, MVT::i8, 4}, + {Intrinsic::smul_with_overflow, MVT::i16, 5}, + {Intrinsic::umul_with_overflow, MVT::i16, 4}, + {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst + {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw + {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp + {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr + }; + EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); + if (MTy.isSimple()) + if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), + MTy.getSimpleVT())) + return Entry->Cost; + break; + } default: break; } @@ -377,12 +416,76 @@ static Optional<Instruction *> processPhiNode(InstCombiner &IC, return IC.replaceInstUsesWith(II, NPN); } +// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))) +// => (binop (pred) (from_svbool _) (from_svbool _)) +// +// The above transformation eliminates a `to_svbool` in the predicate +// operand of the bitwise operation `binop` by narrowing the vector width of +// the operation. For example, it would convert a `<vscale x 16 x i1> +// and` into a `<vscale x 4 x i1> and`. This is profitable because +// to_svbool must zero the new lanes during widening, whereas +// from_svbool is free.
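A toy model of why this rewrite preserves results, in plain C++ rather than IR: predicates are modeled as 16-lane bitmasks, the low four lanes stand in for an nxv4i1, and and_z models the zeroing predicate AND. All names and lane values here are hypothetical.

#include <cassert>
#include <cstdint>

constexpr uint16_t NarrowMask = 0x000F; // the four lanes of an nxv4i1

uint16_t to_svbool(uint16_t P)   { return P & NarrowMask; } // widen: new lanes zeroed
uint16_t from_svbool(uint16_t P) { return P & NarrowMask; } // narrow: drop high lanes
uint16_t and_z(uint16_t Pg, uint16_t A, uint16_t B) { return Pg & A & B; }

int main() {
  uint16_t Pred = 0x0005, A = 0xABCD, B = 0x1234; // hypothetical lane masks
  // Wide form: widen the predicate, operate on all 16 lanes, narrow the result.
  uint16_t Wide = from_svbool(and_z(to_svbool(Pred), A, B));
  // Narrow form produced by the combine: operate on the four lanes directly.
  uint16_t Narrow = and_z(Pred, from_svbool(A), from_svbool(B));
  assert(Wide == Narrow);
  return 0;
}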
+static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC, + IntrinsicInst &II) { + auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); + if (!BinOp) + return None; + + auto IntrinsicID = BinOp->getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::aarch64_sve_and_z: + case Intrinsic::aarch64_sve_bic_z: + case Intrinsic::aarch64_sve_eor_z: + case Intrinsic::aarch64_sve_nand_z: + case Intrinsic::aarch64_sve_nor_z: + case Intrinsic::aarch64_sve_orn_z: + case Intrinsic::aarch64_sve_orr_z: + break; + default: + return None; + } + + auto BinOpPred = BinOp->getOperand(0); + auto BinOpOp1 = BinOp->getOperand(1); + auto BinOpOp2 = BinOp->getOperand(2); + + auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); + if (!PredIntr || + PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) + return None; + + auto PredOp = PredIntr->getOperand(0); + auto PredOpTy = cast<VectorType>(PredOp->getType()); + if (PredOpTy != II.getType()) + return None; + + IRBuilder<> Builder(II.getContext()); + Builder.SetInsertPoint(&II); + + SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; + auto NarrowBinOpOp1 = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); + NarrowedBinOpArgs.push_back(NarrowBinOpOp1); + if (BinOpOp1 == BinOpOp2) + NarrowedBinOpArgs.push_back(NarrowBinOpOp1); + else + NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); + + auto NarrowedBinOp = + Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); + return IC.replaceInstUsesWith(II, NarrowedBinOp); +} + static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { // If the reinterpret instruction operand is a PHI Node if (isa<PHINode>(II.getArgOperand(0))) return processPhiNode(IC, II); + if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) + return BinOpCombine; + SmallVector<Instruction *, 32> CandidatesForRemoval; Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; @@ -1129,6 +1232,32 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return None; } +Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, + APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, + std::function<void(Instruction *, unsigned, APInt, APInt &)> + SimplifyAndSetOp) const { + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::aarch64_neon_fcvtxn: + case Intrinsic::aarch64_neon_rshrn: + case Intrinsic::aarch64_neon_sqrshrn: + case Intrinsic::aarch64_neon_sqrshrun: + case Intrinsic::aarch64_neon_sqshrn: + case Intrinsic::aarch64_neon_sqshrun: + case Intrinsic::aarch64_neon_sqxtn: + case Intrinsic::aarch64_neon_sqxtun: + case Intrinsic::aarch64_neon_uqrshrn: + case Intrinsic::aarch64_neon_uqshrn: + case Intrinsic::aarch64_neon_uqxtn: + SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); + break; + } + + return None; +} + bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, ArrayRef<const Value *> Args) { @@ -1461,6 +1590,15 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, + // Bitcasts from float to integer + { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, + { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, + { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, + + // Bitcasts from integer 
to float + { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, + { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, + { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, }; if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, @@ -1555,9 +1693,12 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (!LT.second.isVector()) return 0; - // The type may be split. Normalize the index to the new type. - unsigned Width = LT.second.getVectorNumElements(); - Index = Index % Width; + // The type may be split. For fixed-width vectors we can normalize the + // index to the new type. + if (LT.second.isFixedLengthVector()) { + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + } // The element at index zero is already inside the vector. if (Index == 0) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index c3e1735cd4cd..a6029b9f2445 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -106,6 +106,12 @@ public: Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; + Optional<Value *> simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function<void(Instruction *, unsigned, APInt, APInt &)> + SimplifyAndSetOp) const; + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { switch (K) { case TargetTransformInfo::RGK_Scalar: @@ -307,6 +313,10 @@ public: return 2; } + bool emitGetActiveLaneMask() const { + return ST->hasSVE(); + } + bool supportsScalableVectors() const { return ST->hasSVE(); } bool enableScalableVectorization() const { return ST->hasSVE(); } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 62038b10fccd..33ed7ae9780e 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" +#include "llvm/Support/AArch64TargetParser.h" #include "llvm/Support/TargetParser.h" #include "llvm/Support/raw_ostream.h" #include <cassert> @@ -3284,6 +3285,8 @@ static const struct Extension { {"sme", {AArch64::FeatureSME}}, {"sme-f64", {AArch64::FeatureSMEF64}}, {"sme-i64", {AArch64::FeatureSMEI64}}, + {"hbc", {AArch64::FeatureHBC}}, + {"mops", {AArch64::FeatureMOPS}}, // FIXME: Unsupported extensions {"lor", {}}, {"rdma", {}}, @@ -3307,12 +3310,16 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { Str += "ARMv8.6a"; else if (FBS[AArch64::HasV8_7aOps]) Str += "ARMv8.7a"; + else if (FBS[AArch64::HasV8_8aOps]) + Str += "ARMv8.8a"; else if (FBS[AArch64::HasV9_0aOps]) Str += "ARMv9-a"; else if (FBS[AArch64::HasV9_1aOps]) Str += "ARMv9.1a"; else if (FBS[AArch64::HasV9_2aOps]) Str += "ARMv9.2a"; + else if (FBS[AArch64::HasV9_3aOps]) + Str += "ARMv9.3a"; else if (FBS[AArch64::HasV8_0rOps]) Str += "ARMv8r"; else { @@ -4531,7 +4538,7 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, Mnemonic = Head; // Handle condition codes for a branch mnemonic - if (Head == "b" && Next != StringRef::npos) { + if ((Head == "b" || Head == "bc") && Next != StringRef::npos) { Start = Next; Next = Name.find('.', Start + 1); Head = Name.slice(Start + 1, 
Next); @@ -4862,6 +4869,177 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, } } + // Check v8.8-A memops instructions. + switch (Inst.getOpcode()) { + case AArch64::CPYFP: + case AArch64::CPYFPWN: + case AArch64::CPYFPRN: + case AArch64::CPYFPN: + case AArch64::CPYFPWT: + case AArch64::CPYFPWTWN: + case AArch64::CPYFPWTRN: + case AArch64::CPYFPWTN: + case AArch64::CPYFPRT: + case AArch64::CPYFPRTWN: + case AArch64::CPYFPRTRN: + case AArch64::CPYFPRTN: + case AArch64::CPYFPT: + case AArch64::CPYFPTWN: + case AArch64::CPYFPTRN: + case AArch64::CPYFPTN: + case AArch64::CPYFM: + case AArch64::CPYFMWN: + case AArch64::CPYFMRN: + case AArch64::CPYFMN: + case AArch64::CPYFMWT: + case AArch64::CPYFMWTWN: + case AArch64::CPYFMWTRN: + case AArch64::CPYFMWTN: + case AArch64::CPYFMRT: + case AArch64::CPYFMRTWN: + case AArch64::CPYFMRTRN: + case AArch64::CPYFMRTN: + case AArch64::CPYFMT: + case AArch64::CPYFMTWN: + case AArch64::CPYFMTRN: + case AArch64::CPYFMTN: + case AArch64::CPYFE: + case AArch64::CPYFEWN: + case AArch64::CPYFERN: + case AArch64::CPYFEN: + case AArch64::CPYFEWT: + case AArch64::CPYFEWTWN: + case AArch64::CPYFEWTRN: + case AArch64::CPYFEWTN: + case AArch64::CPYFERT: + case AArch64::CPYFERTWN: + case AArch64::CPYFERTRN: + case AArch64::CPYFERTN: + case AArch64::CPYFET: + case AArch64::CPYFETWN: + case AArch64::CPYFETRN: + case AArch64::CPYFETN: + case AArch64::CPYP: + case AArch64::CPYPWN: + case AArch64::CPYPRN: + case AArch64::CPYPN: + case AArch64::CPYPWT: + case AArch64::CPYPWTWN: + case AArch64::CPYPWTRN: + case AArch64::CPYPWTN: + case AArch64::CPYPRT: + case AArch64::CPYPRTWN: + case AArch64::CPYPRTRN: + case AArch64::CPYPRTN: + case AArch64::CPYPT: + case AArch64::CPYPTWN: + case AArch64::CPYPTRN: + case AArch64::CPYPTN: + case AArch64::CPYM: + case AArch64::CPYMWN: + case AArch64::CPYMRN: + case AArch64::CPYMN: + case AArch64::CPYMWT: + case AArch64::CPYMWTWN: + case AArch64::CPYMWTRN: + case AArch64::CPYMWTN: + case AArch64::CPYMRT: + case AArch64::CPYMRTWN: + case AArch64::CPYMRTRN: + case AArch64::CPYMRTN: + case AArch64::CPYMT: + case AArch64::CPYMTWN: + case AArch64::CPYMTRN: + case AArch64::CPYMTN: + case AArch64::CPYE: + case AArch64::CPYEWN: + case AArch64::CPYERN: + case AArch64::CPYEN: + case AArch64::CPYEWT: + case AArch64::CPYEWTWN: + case AArch64::CPYEWTRN: + case AArch64::CPYEWTN: + case AArch64::CPYERT: + case AArch64::CPYERTWN: + case AArch64::CPYERTRN: + case AArch64::CPYERTN: + case AArch64::CPYET: + case AArch64::CPYETWN: + case AArch64::CPYETRN: + case AArch64::CPYETN: { + unsigned Xd_wb = Inst.getOperand(0).getReg(); + unsigned Xs_wb = Inst.getOperand(1).getReg(); + unsigned Xn_wb = Inst.getOperand(2).getReg(); + unsigned Xd = Inst.getOperand(3).getReg(); + unsigned Xs = Inst.getOperand(4).getReg(); + unsigned Xn = Inst.getOperand(5).getReg(); + if (Xd_wb != Xd) + return Error(Loc[0], + "invalid CPY instruction, Xd_wb and Xd do not match"); + if (Xs_wb != Xs) + return Error(Loc[0], + "invalid CPY instruction, Xs_wb and Xs do not match"); + if (Xn_wb != Xn) + return Error(Loc[0], + "invalid CPY instruction, Xn_wb and Xn do not match"); + if (Xd == Xs) + return Error(Loc[0], "invalid CPY instruction, destination and source" + " registers are the same"); + if (Xd == Xn) + return Error(Loc[0], "invalid CPY instruction, destination and size" + " registers are the same"); + if (Xs == Xn) + return Error(Loc[0], "invalid CPY instruction, source and size" + " registers are the same"); + break; + } + case AArch64::SETP: + case 
AArch64::SETPT: + case AArch64::SETPN: + case AArch64::SETPTN: + case AArch64::SETM: + case AArch64::SETMT: + case AArch64::SETMN: + case AArch64::SETMTN: + case AArch64::SETE: + case AArch64::SETET: + case AArch64::SETEN: + case AArch64::SETETN: + case AArch64::SETGP: + case AArch64::SETGPT: + case AArch64::SETGPN: + case AArch64::SETGPTN: + case AArch64::SETGM: + case AArch64::SETGMT: + case AArch64::SETGMN: + case AArch64::SETGMTN: + case AArch64::MOPSSETGE: + case AArch64::MOPSSETGET: + case AArch64::MOPSSETGEN: + case AArch64::MOPSSETGETN: { + unsigned Xd_wb = Inst.getOperand(0).getReg(); + unsigned Xn_wb = Inst.getOperand(1).getReg(); + unsigned Xd = Inst.getOperand(2).getReg(); + unsigned Xn = Inst.getOperand(3).getReg(); + unsigned Xm = Inst.getOperand(4).getReg(); + if (Xd_wb != Xd) + return Error(Loc[0], + "invalid SET instruction, Xd_wb and Xd do not match"); + if (Xn_wb != Xn) + return Error(Loc[0], + "invalid SET instruction, Xn_wb and Xn do not match"); + if (Xd == Xn) + return Error(Loc[0], "invalid SET instruction, destination and size" + " registers are the same"); + if (Xd == Xm) + return Error(Loc[0], "invalid SET instruction, destination and source" + " registers are the same"); + if (Xn == Xm) + return Error(Loc[0], "invalid SET instruction, source and size" + " registers are the same"); + break; + } + } // Now check immediate ranges. Separate from the above as there is overlap // in the instructions being checked and this keeps the nested conditionals @@ -5931,9 +6109,11 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, case AArch64::ArchKind::ARMV8_5A: case AArch64::ArchKind::ARMV8_6A: case AArch64::ArchKind::ARMV8_7A: + case AArch64::ArchKind::ARMV8_8A: case AArch64::ArchKind::ARMV9A: case AArch64::ArchKind::ARMV9_1A: case AArch64::ArchKind::ARMV9_2A: + case AArch64::ArchKind::ARMV9_3A: case AArch64::ArchKind::ARMV8R: RequestedExtensions.push_back("sm4"); RequestedExtensions.push_back("sha3"); @@ -5956,6 +6136,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, case AArch64::ArchKind::ARMV8_5A: case AArch64::ArchKind::ARMV8_6A: case AArch64::ArchKind::ARMV8_7A: + case AArch64::ArchKind::ARMV8_8A: case AArch64::ArchKind::ARMV9A: case AArch64::ArchKind::ARMV9_1A: case AArch64::ArchKind::ARMV9_2A: diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 96d410e42be2..9ce00f76d9c7 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -238,6 +238,12 @@ static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { @@ -1842,3 +1848,52 @@ static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, } return Fail; } + +static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rs = fieldFromInstruction(insn, 16, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + + // None of the registers may alias: if they 
do, then the instruction is not + // merely unpredictable but actually entirely unallocated. + if (Rd == Rs || Rs == Rn || Rd == Rn) + return MCDisassembler::Fail; + + // All three register operands are written back, so they all appear + // twice in the operand list, once as outputs and once as inputs. + if (!DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || + !DecodeGPR64commonRegisterClass(Inst, Rs, Addr, Decoder) || + !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) || + !DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || + !DecodeGPR64commonRegisterClass(Inst, Rs, Addr, Decoder) || + !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder)) + return MCDisassembler::Fail; + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + + // None of the registers may alias: if they do, then the instruction is not + // merely unpredictable but actually entirely unallocated. + if (Rd == Rm || Rm == Rn || Rd == Rn) + return MCDisassembler::Fail; + + // Rd and Rn (not Rm) register operands are written back, so they appear + // twice in the operand list, once as outputs and once as inputs. + if (!DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || + !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) || + !DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) || + !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) || + !DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder)) + return MCDisassembler::Fail; + + return MCDisassembler::Success; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index ac08ee8ae8dd..097b93e4fcca 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -1112,6 +1112,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; } + Info.IsTailCall = CanTailCallOpt; if (CanTailCallOpt) return lowerTailCall(MIRBuilder, Info, OutArgs); @@ -1179,7 +1180,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!determineAndHandleAssignments( UsingReturnedArg ? ReturnedArgHandler : Handler, Assigner, InArgs, MIRBuilder, Info.CallConv, Info.IsVarArg, - UsingReturnedArg ? OutArgs[0].Regs[0] : Register())) + UsingReturnedArg ? 
makeArrayRef(OutArgs[0].Regs) : None)) return false; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h index add0342c90fd..aafb1d19640a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -24,9 +24,7 @@ namespace llvm { class AArch64TargetLowering; class CCValAssign; -class DataLayout; class MachineIRBuilder; -class MachineRegisterInfo; class Type; class AArch64CallLowering: public CallLowering { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3d9a626d3ac3..1f546ad50d57 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -18,7 +18,6 @@ #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" -#include "AArch64GlobalISelUtils.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/Optional.h" @@ -472,8 +471,8 @@ private: AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) - : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), + : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), + RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT @@ -3937,19 +3936,19 @@ static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg, // vector's elements. switch (EltSize) { case 8: - CopyOpc = AArch64::CPYi8; + CopyOpc = AArch64::DUPi8; ExtractSubReg = AArch64::bsub; break; case 16: - CopyOpc = AArch64::CPYi16; + CopyOpc = AArch64::DUPi16; ExtractSubReg = AArch64::hsub; break; case 32: - CopyOpc = AArch64::CPYi32; + CopyOpc = AArch64::DUPi32; ExtractSubReg = AArch64::ssub; break; case 64: - CopyOpc = AArch64::CPYi64; + CopyOpc = AArch64::DUPi64; ExtractSubReg = AArch64::dsub; break; default: @@ -5469,8 +5468,8 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, // Insert the copy from LR/X30 into the entry block, before it can be // clobbered by anything. MFI.setReturnAddressIsTaken(true); - MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR, - AArch64::GPR64RegClass); + MFReturnAddr = getFunctionLiveInPhysReg( + MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc()); } if (STI.hasPAuth()) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 35456d95dc2b..e2c46f4b4c1f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -21,7 +21,6 @@ namespace llvm { -class LLVMContext; class AArch64Subtarget; /// This class provides the information for the target register banks. 
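The CPY/SET decoders added in the AArch64Disassembler.cpp hunk above both enforce the same structural rule: the three MOPS register operands must be pairwise distinct, and an aliasing encoding is rejected outright because it is unallocated rather than merely unpredictable. A minimal standalone sketch of that check follows; it is illustrative only — fieldFromInstruction is reimplemented locally for self-containment (LLVM has its own helper of this name), and mopsCpyRegsValid is an invented name.

#include <cstdint>

// Extract Width bits of Insn starting at bit Start.
static unsigned fieldFromInstruction(uint32_t Insn, unsigned Start,
                                     unsigned Width) {
  return (Insn >> Start) & ((1u << Width) - 1);
}

// CPY-family layout per the hunk above: Rd in bits [4:0], Rs in [20:16],
// Rn in [9:5]. Any aliasing pair makes the encoding unallocated, so a
// decoder must report a hard failure rather than a soft "unpredictable".
static bool mopsCpyRegsValid(uint32_t Insn) {
  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
  unsigned Rs = fieldFromInstruction(Insn, 16, 5);
  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
  return Rd != Rs && Rs != Rn && Rd != Rn;
}

The assembler-side validation in the AArch64AsmParser hunk further up applies the same pairwise-distinct rule to the SET family, and additionally checks that each write-back operand (Xd_wb, Xn_wb) matches its tied input register.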
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 7274ae79f74a..225e0c8e55fc 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -19,7 +19,6 @@ namespace llvm { class MCStreamer; -class Target; class Triple; struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 941226b83e44..66cb7a37a958 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -30,11 +30,7 @@ class MCStreamer; class MCSubtargetInfo; class MCTargetOptions; class MCTargetStreamer; -class StringRef; class Target; -class Triple; -class raw_ostream; -class raw_pwrite_stream; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index bb488cd7da32..574b22124957 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -334,6 +334,8 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> { def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>; +def SDT_AArch64PFalse : SDTypeProfile<1, 0, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>]>; +def AArch64pfalse : SDNode<"AArch64ISD::PFALSE", SDT_AArch64PFalse>; let Predicates = [HasSVEorStreamingSVE] in { defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>; @@ -609,6 +611,15 @@ class sve_int_pfalse<bits<6> opc, string asm> let isReMaterializable = 1; } +multiclass sve_int_pfalse<bits<6> opc, string asm> { + def NAME : sve_int_pfalse<opc, asm>; + + def : Pat<(nxv16i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; + def : Pat<(nxv8i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; + def : Pat<(nxv4i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; + def : Pat<(nxv2i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; +} + class sve_int_ptest<bits<6> opc, string asm> : I<(outs), (ins PPRAny:$Pg, PPR8:$Pn), asm, "\t$Pg, $Pn", @@ -1622,6 +1633,18 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op, !cast<Instruction>(NAME), PTRUE_D>; } +multiclass sve_int_pred_log_and<bits<4> opc, string asm, SDPatternOperator op> : + sve_int_pred_log<opc, asm, op> { + def : Pat<(nxv16i1 (and nxv16i1:$Op1, nxv16i1:$Op2)), + (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; + def : Pat<(nxv8i1 (and nxv8i1:$Op1, nxv8i1:$Op2)), + (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; + def : Pat<(nxv4i1 (and nxv4i1:$Op1, nxv4i1:$Op2)), + (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; + def : Pat<(nxv2i1 (and nxv2i1:$Op1, nxv2i1:$Op2)), + (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; +} + //===----------------------------------------------------------------------===// // SVE Logical Mask Immediate Group //===----------------------------------------------------------------------===// @@ -1708,6 +1731,9 @@ multiclass sve_int_dup_mask_imm<string asm> { (!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>; def : InstAlias<"mov $Zd, $imm", (!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>; + + def : Pat<(nxv2i64 (AArch64dup (i64 logical_imm64:$imm))), + (!cast<Instruction>(NAME) 
logical_imm64:$imm)>; } //===----------------------------------------------------------------------===// @@ -4641,6 +4667,10 @@ multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt, (cmp $Op1, $Op2, $Op3)>; def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)), (cmp $Op1, $Op3, $Op2)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, cc))), + (cmp $Pg, $Op2, $Op3)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, invcc))), + (cmp $Pg, $Op3, $Op2)>; } multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt, diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 642080a0d40d..4a24162540a5 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -40,10 +40,6 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-sve-intrinsic-opts" -namespace llvm { -void initializeSVEIntrinsicOptsPass(PassRegistry &); -} - namespace { struct SVEIntrinsicOpts : public ModulePass { static char ID; // Pass identification, replacement for typeid diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index caee2acd2606..5906a5d6b50b 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -483,18 +483,20 @@ inline unsigned getNumElementsFromSVEPredPattern(unsigned Pattern) { } /// Return specific VL predicate pattern based on the number of elements. -inline unsigned getSVEPredPatternFromNumElements(unsigned MinNumElts) { +inline Optional<unsigned> +getSVEPredPatternFromNumElements(unsigned MinNumElts) { switch (MinNumElts) { default: - llvm_unreachable("unexpected element count for SVE predicate"); + return None; case 1: - return AArch64SVEPredPattern::vl1; case 2: - return AArch64SVEPredPattern::vl2; + case 3: case 4: - return AArch64SVEPredPattern::vl4; + case 5: + case 6: + case 7: case 8: - return AArch64SVEPredPattern::vl8; + return MinNumElts; case 16: return AArch64SVEPredPattern::vl16; case 32: @@ -757,7 +759,6 @@ namespace AArch64 { // <n x (M*P) x t> vector (such as index 1) are undefined. 
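The getSVEPredPatternFromNumElements change in the AArch64BaseInfo.h hunk just above turns an llvm_unreachable into a recoverable result: the function now returns Optional<unsigned>, with counts 1 through 8 mapping directly to vl1-vl8 (those predicate-pattern encodings equal the element count, which is why the grouped cases can simply return MinNumElts) and unsupported counts yielding None. A rough self-contained restatement follows, using std::optional in place of llvm::Optional and eliding the larger power-of-two cases just as the hunk's context window does:

#include <optional>

static std::optional<unsigned> predPatternForCount(unsigned MinNumElts) {
  if (MinNumElts >= 1 && MinNumElts <= 8)
    return MinNumElts;   // vl1..vl8 encode as their own element count
  if (MinNumElts == 16)
    return 9;            // AArch64SVEPredPattern::vl16
  // ...further power-of-two cases continue in the real function...
  return std::nullopt;   // previously llvm_unreachable
}

Callers that used to rely on the assertion now have to test the returned value before use; a count such as 12, for example, produces None rather than a crash.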
static constexpr unsigned SVEBitsPerBlock = 128; static constexpr unsigned SVEMaxBitsPerVector = 2048; -const unsigned NeonBitsPerVector = 128; } // end namespace AArch64 } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index e606f0e8fc3c..806c0b18637a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -610,12 +610,6 @@ def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts", "Has ds_*_src2 instructions" >; -def FeatureRegisterBanking : SubtargetFeature<"register-banking", - "HasRegisterBanking", - "true", - "Has register banking" ->; - def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", "HasVOP3Literal", "true", @@ -826,7 +820,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, - FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, + FeatureNoSdstCMPX, FeatureVscnt, FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index 22be014813b0..5ba9b2cd187e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -26,7 +26,7 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> { const DataLayout &DL; public: - explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {} + explicit AMDGPUAAResult(const DataLayout &DL) : DL(DL) {} AMDGPUAAResult(AMDGPUAAResult &&Arg) : AAResultBase(std::move(Arg)), DL(Arg.DL) {} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 2f1e7823f65c..cd084fd5440a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -192,8 +192,20 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (!SPReg) - SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0); + if (!SPReg) { + const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>(); + if (ST.enableFlatScratch()) { + // The stack is accessed unswizzled, so we can use a regular copy. + SPReg = MIRBuilder.buildCopy(PtrTy, + MFI->getStackPtrOffsetReg()).getReg(0); + } else { + // The address we produce here, without knowing the use context, is going + // to be interpreted as a vector address, so we need to convert to a + // swizzled address. 
+ SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy}, + {MFI->getStackPtrOffsetReg()}).getReg(0); + } + } auto OffsetReg = MIRBuilder.buildConstant(S32, Offset); @@ -615,6 +627,13 @@ bool AMDGPUCallLowering::lowerFormalArguments( CCInfo.AllocateReg(ImplicitBufferPtrReg); } + // FIXME: This probably isn't defined for mesa + if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) { + Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + SmallVector<ArgInfo, 32> SplitArgs; unsigned Idx = 0; unsigned PSInputNum = 0; @@ -879,13 +898,17 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, Register InputReg; if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && NeedWorkItemIDX) { - InputReg = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX, - std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX)); + if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) { + InputReg = MRI.createGenericVirtualRegister(S32); + LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX, + std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX)); + } else { + InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0); + } } if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && - NeedWorkItemIDY) { + NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) { Register Y = MRI.createGenericVirtualRegister(S32); LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY), std::get<2>(WorkitemIDY)); @@ -895,7 +918,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && - NeedWorkItemIDZ) { + NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) { Register Z = MRI.createGenericVirtualRegister(S32); LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ), std::get<2>(WorkitemIDZ)); @@ -904,16 +927,24 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z; } - if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { + if (!InputReg && + (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { InputReg = MRI.createGenericVirtualRegister(S32); - - // Workitem ids are already packed, any of present incoming arguments will - // carry all required fields. - ArgDescriptor IncomingArg = ArgDescriptor::createArg( - IncomingArgX ? *IncomingArgX : + if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { + // We're in a situation where the outgoing function requires the workitem + // ID, but the calling function does not have it (e.g. a graphics function + // calling a C calling convention function). This is illegal, but we need + // to produce something. + MIRBuilder.buildUndef(InputReg); + } else { + // Workitem ids are already packed, any of the present incoming arguments + // will carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? *IncomingArgX : IncomingArgY ?
*IncomingArgY : *IncomingArgZ, ~0u); - LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, - &AMDGPU::VGPR_32RegClass, S32); + LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, + &AMDGPU::VGPR_32RegClass, S32); + } } if (OutgoingArg->isRegister()) { @@ -1314,6 +1345,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; } + Info.IsTailCall = CanTailCallOpt; if (CanTailCallOpt) return lowerTailCall(MIRBuilder, Info, OutArgs); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index a55729586b8d..1920684d8f1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -150,13 +150,13 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to - /// ScalarSize will not change the value. - unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const; + /// the original will not change the value. + unsigned numBitsUnsigned(Value *Op) const; /// \returns The minimum number of bits needed to store the value of \Op as a /// signed integer. Truncating to this size and then sign-extending to - /// ScalarSize will not change the value. - unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const; + /// the original size will not change the value. + unsigned numBitsSigned(Value *Op) const; /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24. /// SelectionDAG has an issue where an and asserting the bits are known @@ -445,17 +445,12 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( return true; } -unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op, - unsigned ScalarSize) const { - KnownBits Known = computeKnownBits(Op, *DL, 0, AC); - return ScalarSize - Known.countMinLeadingZeros(); +unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const { + return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits(); } -unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op, - unsigned ScalarSize) const { - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. 
- return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC) + 1; +unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const { + return ComputeMaxSignificantBits(Op, *DL, 0, AC); } static void extractValues(IRBuilder<> &Builder, @@ -532,12 +527,12 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { unsigned LHSBits = 0, RHSBits = 0; bool IsSigned = false; - if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 && - (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) { + if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && + (RHSBits = numBitsUnsigned(RHS)) <= 24) { IsSigned = false; - } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) <= 24 && - (RHSBits = numBitsSigned(RHS, Size)) <= 24) { + } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && + (RHSBits = numBitsSigned(RHS)) <= 24) { IsSigned = true; } else diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 699c6c479455..3ac7c45b3275 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -331,8 +331,7 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // FIXME: Should report this for all address spaces - PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), - PtrTy->getElementType()); + PointeeAlign = Arg.getParamAlign().valueOrOne(); } } @@ -731,10 +730,8 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, // FIXME: Need to distinguish in memory alignment from pointer alignment. if (auto PtrTy = dyn_cast<PointerType>(Ty)) { - if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), - PtrTy->getElementType()); - } + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) + PointeeAlign = Arg.getParamAlign().valueOrOne(); } // There's no distinction between byval aggregates and raw aggregates. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 54177564afbc..b9d0655feef7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -51,7 +51,7 @@ unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { // In order for this to be a signed 24-bit value, bit 23, must // be a sign bit. 
- return DAG.ComputeMinSignedBits(Op); + return DAG.ComputeMaxSignificantBits(Op); } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, @@ -360,6 +360,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); @@ -1408,6 +1410,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, Start != 1) return Op; + if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) || + (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) && + (Start == 0 || Start == 4)) + return Op; + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); @@ -4626,11 +4633,12 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( RHSKnown = RHSKnown.trunc(24); if (Opc == AMDGPUISD::MUL_I24) { - unsigned LHSValBits = 24 - LHSKnown.countMinSignBits(); - unsigned RHSValBits = 24 - RHSKnown.countMinSignBits(); - unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); - if (MaxValBits >= 32) + unsigned LHSValBits = LHSKnown.countMaxSignificantBits(); + unsigned RHSValBits = RHSKnown.countMaxSignificantBits(); + unsigned MaxValBits = LHSValBits + RHSValBits; + if (MaxValBits > 32) break; + unsigned SignBits = 32 - MaxValBits + 1; bool LHSNegative = LHSKnown.isNegative(); bool LHSNonNegative = LHSKnown.isNonNegative(); bool LHSPositive = LHSKnown.isStrictlyPositive(); @@ -4639,16 +4647,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( bool RHSPositive = RHSKnown.isStrictlyPositive(); if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) - Known.Zero.setHighBits(32 - MaxValBits); + Known.Zero.setHighBits(SignBits); else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) - Known.One.setHighBits(32 - MaxValBits); + Known.One.setHighBits(SignBits); } else { - unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros(); - unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros(); - unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); + unsigned LHSValBits = LHSKnown.countMaxActiveBits(); + unsigned RHSValBits = RHSKnown.countMaxActiveBits(); + unsigned MaxValBits = LHSValBits + RHSValBits; if (MaxValBits >= 32) break; - Known.Zero.setHighBits(32 - MaxValBits); + Known.Zero.setBitsFrom(MaxValBits); } break; } @@ -4904,7 +4912,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { } } -bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal( +bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal( unsigned Opc, LLT Ty1, LLT Ty2) const { - return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); + return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) && + Ty2 == LLT::scalar(32); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index daaca8737c5d..b41506157b68 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -335,8 +335,8 @@ public: AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - bool 
isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1, - LLT Ty2) const override; + bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, + LLT Ty2) const override; }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index db84b8766924..4f1d700bcd84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -58,24 +58,37 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, // Check if a value can be converted to a 16-bit value without losing // precision. -static bool canSafelyConvertTo16Bit(Value &V) { +// The value is expected to be either a float (IsFloat = true) or an unsigned +// integer (IsFloat = false). +static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) { Type *VTy = V.getType(); if (VTy->isHalfTy() || VTy->isIntegerTy(16)) { // The value is already 16-bit, so we don't want to convert to 16-bit again! return false; } - if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) { - // We need to check that if we cast the index down to a half, we do not lose - // precision. - APFloat FloatValue(ConstFloat->getValueAPF()); - bool LosesInfo = true; - FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo); - return !LosesInfo; + if (IsFloat) { + if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) { + // We need to check that if we cast the index down to a half, we do not + // lose precision. + APFloat FloatValue(ConstFloat->getValueAPF()); + bool LosesInfo = true; + FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, + &LosesInfo); + return !LosesInfo; + } + } else { + if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) { + // We need to check that if we cast the index down to an i16, we do not + // lose precision. + APInt IntValue(ConstInt->getValue()); + return IntValue.getActiveBits() <= 16; + } } + Value *CastSrc; - if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) || - match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) || - match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) { + bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) + : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc))); + if (IsExt) { Type *CastSrcTy = CastSrc->getType(); if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16)) return true; @@ -97,13 +110,116 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) { llvm_unreachable("Should never be called!"); } +/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with +/// the modified arguments. 
+static Optional<Instruction *> modifyIntrinsicCall( + IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC, + std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)> + Func) { + SmallVector<Type *, 4> ArgTys; + if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) + return None; + + SmallVector<Value *, 8> Args(II.args()); + + // Modify arguments and types + Func(Args, ArgTys); + + Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys); + + CallInst *NewCall = IC.Builder.CreateCall(I, Args); + NewCall->takeName(&II); + NewCall->copyMetadata(II); + if (isa<FPMathOperator>(NewCall)) + NewCall->copyFastMathFlags(&II); + + // Erase and replace uses + if (!II.getType()->isVoidTy()) + IC.replaceInstUsesWith(II, NewCall); + return IC.eraseInstFromFunction(II); +} + static Optional<Instruction *> simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC) { + // Optimize _L to _LZ when _L is zero + if (const auto *LZMappingInfo = + AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantLod = + dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) { + if (ConstantLod->isZero() || ConstantLod->isNegative()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, + ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->LodIndex); + }); + } + } + } + + // Optimize _mip away, when 'lod' is zero + if (const auto *MIPMappingInfo = + AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantMip = + dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) { + if (ConstantMip->isZero()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP, + ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->MipIndex); + }); + } + } + } + + // Optimize _bias away when 'bias' is zero + if (const auto *BiasMappingInfo = + AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantBias = + dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) { + if (ConstantBias->isZero()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias, + ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->BiasIndex); + ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg); + }); + } + } + } + + // Optimize _offset away when 'offset' is zero + if (const auto *OffsetMappingInfo = + AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantOffset = + dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) { + if (ConstantOffset->isZero()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode( + OffsetMappingInfo->NoOffset, ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->OffsetIndex); + }); + } + } + } + + // Try to use A16 or G16 if (!ST->hasA16() && !ST->hasG16()) return None; + // Address is interpreted as float if the instruction has 
a sampler or as + // unsigned int if there is no sampler. + bool HasSampler = + AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler; bool FloatCoord = false; // true means derivatives can be converted to 16 bit, coordinates not bool OnlyDerivatives = false; @@ -112,7 +228,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) { Value *Coord = II.getOperand(OperandIndex); // If the values are not derived from 16-bit values, we cannot optimize. - if (!canSafelyConvertTo16Bit(*Coord)) { + if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) { if (OperandIndex < ImageDimIntr->CoordStart || ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) { return None; @@ -127,43 +243,50 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, FloatCoord = Coord->getType()->isFloatingPointTy(); } - if (OnlyDerivatives) { - if (!ST->hasG16()) - return None; - } else { - if (!ST->hasA16()) - OnlyDerivatives = true; // Only supports G16 + if (!OnlyDerivatives && !ST->hasA16()) + OnlyDerivatives = true; // Only supports G16 + + // Check if there is a bias parameter and if it can be converted to f16 + if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) { + Value *Bias = II.getOperand(ImageDimIntr->BiasIndex); + assert(HasSampler && + "Only image instructions with a sampler can have a bias"); + if (!canSafelyConvertTo16Bit(*Bias, HasSampler)) + OnlyDerivatives = true; } + if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart == + ImageDimIntr->CoordStart)) + return None; + Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext()) : Type::getInt16Ty(II.getContext()); - SmallVector<Type *, 4> ArgTys; - if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) - return None; - - ArgTys[ImageDimIntr->GradientTyArg] = CoordType; - if (!OnlyDerivatives) - ArgTys[ImageDimIntr->CoordTyArg] = CoordType; - Function *I = - Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys); + return modifyIntrinsicCall( + II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) { + ArgTys[ImageDimIntr->GradientTyArg] = CoordType; + if (!OnlyDerivatives) { + ArgTys[ImageDimIntr->CoordTyArg] = CoordType; - SmallVector<Value *, 8> Args(II.args()); + // Change the bias type + if (ImageDimIntr->NumBiasArgs != 0) + ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext()); + } - unsigned EndIndex = - OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd; - for (unsigned OperandIndex = ImageDimIntr->GradientStart; - OperandIndex < EndIndex; OperandIndex++) { - Args[OperandIndex] = - convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder); - } + unsigned EndIndex = + OnlyDerivatives ? 
ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd; + for (unsigned OperandIndex = ImageDimIntr->GradientStart; + OperandIndex < EndIndex; OperandIndex++) { + Args[OperandIndex] = + convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder); + } - CallInst *NewCall = IC.Builder.CreateCall(I, Args); - NewCall->takeName(&II); - NewCall->copyMetadata(II); - if (isa<FPMathOperator>(NewCall)) - NewCall->copyFastMathFlags(&II); - return IC.replaceInstUsesWith(II, NewCall); + // Convert the bias + if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) { + Value *Bias = II.getOperand(ImageDimIntr->BiasIndex); + Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder); + } + }); } bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index b1263618c5db..e7ee36447682 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -20,9 +20,6 @@ namespace llvm { class GCNSubtarget; -class MachineFunction; -class MachineInstr; -class MachineInstrBuilder; class MachineMemOperand; class AMDGPUInstrInfo { @@ -52,6 +49,9 @@ struct ImageDimIntrinsicInfo { unsigned BaseOpcode; MIMGDim Dim; + uint8_t NumOffsetArgs; + uint8_t NumBiasArgs; + uint8_t NumZCompareArgs; uint8_t NumGradients; uint8_t NumDmask; uint8_t NumData; @@ -60,6 +60,9 @@ struct ImageDimIntrinsicInfo { uint8_t DMaskIndex; uint8_t VAddrStart; + uint8_t OffsetIndex; + uint8_t BiasIndex; + uint8_t ZCompareIndex; uint8_t GradientStart; uint8_t CoordStart; uint8_t LodIndex; @@ -71,6 +74,7 @@ struct ImageDimIntrinsicInfo { uint8_t TexFailCtrlIndex; uint8_t CachePolicyIndex; + uint8_t BiasTyArg; uint8_t GradientTyArg; uint8_t CoordTyArg; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index e16bead81b65..b7d0f0580cda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -46,8 +46,7 @@ static cl::opt<bool> AllowRiskySelect( AMDGPUInstructionSelector::AMDGPUInstructionSelector( const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM) - : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), + : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), STI(STI), EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), #define GET_GLOBALISEL_PREDICATES_INIT @@ -1103,7 +1102,18 @@ bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); Register SrcReg = I.getOperand(2).getReg(); unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); + auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm()); + if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(Pred))) { + MachineInstr *ICmp = + BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst); + + if (!RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), + *TRI.getBoolRC(), *MRI)) + return false; + I.eraseFromParent(); + return true; + } int Opcode = getV_CMPOpcode(Pred, Size); if (Opcode == -1) @@ -1234,7 +1244,7 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { // Get the return address reg and mark it as an implicit live-in Register ReturnAddrReg = TRI.getReturnAddressReg(MF); Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg, - AMDGPU::SReg_64RegClass); + 
AMDGPU::SReg_64RegClass, DL); BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg) .addReg(LiveIn); I.eraseFromParent(); @@ -1494,9 +1504,9 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, if (TexFailCtrl) IsTexFail = true; - TFE = (TexFailCtrl & 0x1) ? 1 : 0; + TFE = (TexFailCtrl & 0x1) ? true : false; TexFailCtrl &= ~(uint64_t)0x1; - LWE = (TexFailCtrl & 0x2) ? 1 : 0; + LWE = (TexFailCtrl & 0x2) ? true : false; TexFailCtrl &= ~(uint64_t)0x2; return TexFailCtrl == 0; @@ -1511,10 +1521,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); - const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = - AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); @@ -1523,7 +1529,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( Register VDataIn, VDataOut; LLT VDataTy; int NumVDataDwords = -1; - bool IsD16 = false; + bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || + MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16; bool Unorm; if (!BaseOpcode->Sampler) @@ -1572,16 +1579,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); - // One memoperand is mandatory, except for getresinfo. - // FIXME: Check this in verifier. - if (!MI.memoperands_empty()) { - const MachineMemOperand *MMO = *MI.memoperands_begin(); - - // Infer d16 from the memory size, as the register type will be mangled by - // unpacked subtargets, or by TFE. - IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; - } - if (BaseOpcode->Store) { VDataIn = MI.getOperand(1).getReg(); VDataTy = MRI->getType(VDataIn); @@ -1596,26 +1593,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } } - // Optimize _L to _LZ when _L is zero - if (LZMappingInfo) { - // The legalizer replaced the register with an immediate 0 if we need to - // change the opcode. - const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex); - if (Lod.isImm()) { - assert(Lod.getImm() == 0); - IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l - } - } - - // Optimize _mip away, when 'lod' is zero - if (MIPMappingInfo) { - const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex); - if (Lod.isImm()) { - assert(Lod.getImm() == 0); - IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip - } - } - // Set G16 opcode if (IsG16 && !IsA16) { const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = @@ -2562,6 +2539,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { Register MaskReg = I.getOperand(2).getReg(); LLT Ty = MRI->getType(DstReg); LLT MaskTy = MRI->getType(MaskReg); + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); @@ -2570,6 +2549,24 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { if (DstRB != SrcRB) // Should only happen for hand written MIR. return false; + // Try to avoid emitting a bit operation when we only need to touch half of + // the 64-bit pointer. 
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); + const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); + const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); + + const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; + const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; + + if (!IsVGPR && Ty.getSizeInBits() == 64 && + !CanCopyLow32 && !CanCopyHi32) { + auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) + .addReg(SrcReg) + .addReg(MaskReg); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + } + unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; const TargetRegisterClass &RegRC = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; @@ -2586,8 +2583,6 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) return false; - MachineBasicBlock *BB = I.getParent(); - const DebugLoc &DL = I.getDebugLoc(); if (Ty.getSizeInBits() == 32) { assert(MaskTy.getSizeInBits() == 32 && "ptrmask should have been narrowed during legalize"); @@ -2610,13 +2605,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { Register MaskedLo, MaskedHi; - // Try to avoid emitting a bit operation when we only need to touch half of - // the 64-bit pointer. - APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); - - const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); - const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); - if ((MaskOnes & MaskLo32) == MaskLo32) { + if (CanCopyLow32) { // If all the bits in the low half are 1, we only need a copy for it. MaskedLo = LoReg; } else { @@ -2631,7 +2620,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { .addReg(MaskLo); } - if ((MaskOnes & MaskHi32) == MaskHi32) { + if (CanCopyHi32) { // If all the bits in the high half are 1, we only need a copy for it. MaskedHi = HiReg; } else { @@ -3123,6 +3112,33 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ return true; } +bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (IsVALU) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) + .addImm(Subtarget->getWavefrontSizeLog2()) + .addReg(SrcReg); + } else { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) + .addReg(SrcReg) + .addImm(Subtarget->getWavefrontSizeLog2()); + } + + const TargetRegisterClass &RC = + IsVALU ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; + if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) + return false; + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -3236,7 +3252,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SHUFFLE_VECTOR: return selectG_SHUFFLE_VECTOR(I); case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: - case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { const AMDGPU::ImageDimIntrinsicInfo *Intr = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); assert(Intr && "not an image intrinsic with image pseudo"); @@ -3252,6 +3270,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case AMDGPU::G_SI_CALL: I.setDesc(TII.get(AMDGPU::SI_CALL)); return true; + case AMDGPU::G_AMDGPU_WAVE_ADDRESS: + return selectWaveAddress(I); default: return selectImpl(I, *CoverageInfo); } @@ -3896,20 +3916,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits; } +// Return the wave level SGPR base address if this is a wave address. +static Register getWaveAddress(const MachineInstr *Def) { + return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS + ? Def->getOperand(1).getReg() + : Register(); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); + Register Reg = Root.getReg(); + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + + const MachineInstr *Def = MRI->getVRegDef(Reg); + if (Register WaveBase = getWaveAddress(Def)) { + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // soffset + MIB.addReg(WaveBase); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset + }}; + } int64_t Offset = 0; + + // FIXME: Copy check is a hack + Register BasePtr; + if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) { + if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset)) + return {}; + const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr); + Register WaveBase = getWaveAddress(BasePtrDef); + if (!WaveBase) + return {}; + + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // soffset + MIB.addReg(WaveBase); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset + }}; + } + if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) return {}; - const MachineFunction *MF = MBB->getParent(); - const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - return {{ [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 26996e42af53..42095332d11a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -30,7 +30,6 @@ namespace AMDGPU { struct ImageDimIntrinsicInfo; } -class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class AMDGPUTargetMachine; class BlockFrequencyInfo; @@ -42,7 +41,6 @@ class 
MachineOperand; class MachineRegisterInfo; class RegisterBank; class SIInstrInfo; -class SIMachineFunctionInfo; class SIRegisterInfo; class TargetRegisterClass; @@ -147,6 +145,7 @@ private: bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, MachineOperand &DataOp) const; bool selectBVHIntrinsic(MachineInstr &I) const; + bool selectWaveAddress(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 0528b552f475..7d3dbfd7e851 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -18,6 +18,7 @@ class AddressSpacesImpl { int Local = 3; int Constant = 4; int Private = 5; + int Constant32Bit = 6; } def AddrSpaces : AddressSpacesImpl; @@ -405,18 +406,23 @@ class Aligned<int Bytes> { int MinAlignment = Bytes; } -class StoreHi16<SDPatternOperator op> : PatFrag < +class StoreHi16<SDPatternOperator op, ValueType vt> : PatFrag < (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> { let IsStore = 1; + let MemoryVT = vt; } -def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>; -def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>; +def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; +def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, + AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>; -def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, - AddrSpaces.Global, - AddrSpaces.Constant ]>; +def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, + AddrSpaces.Global, + AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>; def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>; @@ -522,9 +528,9 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), let MemoryVT = i16; } -def store_hi16_#as : StoreHi16 <truncstorei16>; -def truncstorei8_hi16_#as : StoreHi16<truncstorei8>; -def truncstorei16_hi16_#as : StoreHi16<truncstorei16>; +def store_hi16_#as : StoreHi16 <truncstorei16, i16>; +def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>; +def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>; defm atomic_store_#as : binary_atomic_op<atomic_store>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5046daaed977..04c6f67ed339 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -272,8 +272,8 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; unsigned RegSize = Ty.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - unsigned AlignBits = Query.MMODescrs[0].AlignInBits; + uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; unsigned AS = Query.Types[1].getAddressSpace(); // All of these need to be custom lowered to cast the pointer operand. @@ -380,7 +380,7 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, /// access up to the alignment. Note this case when the memory access itself /// changes, not the size of the result register. 
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, - unsigned AlignInBits, unsigned AddrSpace, + uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode) { unsigned SizeInBits = MemoryTy.getSizeInBits(); // We don't want to widen cases that are naturally legal. @@ -929,10 +929,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_CTPOP) .legalFor({{S32, S32}, {S32, S64}}) .clampScalar(0, S32, S32) + .widenScalarToNextPow2(1, 32) .clampScalar(1, S32, S64) .scalarize(0) - .widenScalarToNextPow2(0, 32) - .widenScalarToNextPow2(1, 32); + .widenScalarToNextPow2(0, 32); + // The hardware instructions return a different result on 0 than the generic // instructions expect. The hardware produces -1, but these produce the @@ -1172,7 +1173,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (MemSize > MaxSize) return std::make_pair(0, LLT::scalar(MaxSize)); - unsigned Align = Query.MMODescrs[0].AlignInBits; + uint64_t Align = Query.MMODescrs[0].AlignInBits; return std::make_pair(0, LLT::scalar(Align)); }) .fewerElementsIf( @@ -1295,6 +1296,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); + if (ST.hasGFX90AInsts()) { + // These are legal with some caveats, and should have undergone expansion in + // the IR in most situations + // TODO: Move atomic expansion into legalizer + // TODO: Also supports <2 x f16> + Atomic.legalFor({ + {S32, GlobalPtr}, + {S64, GlobalPtr}, + {S64, FlatPtr} + }); + } + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) @@ -1345,8 +1358,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }, changeTo(1, S16)); Shifts.maxScalarIf(typeIs(0, S16), 1, S16); Shifts.clampScalar(1, S32, S32); - Shifts.clampScalar(0, S16, S64); Shifts.widenScalarToNextPow2(0, 16); + Shifts.clampScalar(0, S16, S64); getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) .minScalar(0, S16) @@ -1357,8 +1370,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // expansion for the shifted type will produce much worse code if it hasn't // been truncated already. Shifts.clampScalar(1, S32, S32); - Shifts.clampScalar(0, S32, S64); Shifts.widenScalarToNextPow2(0, 32); + Shifts.clampScalar(0, S32, S64); getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) .minScalar(0, S32) @@ -1812,6 +1825,27 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } +/// Return true if the value is a known valid address, such that a null check is +/// not necessary. 
+static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, + const AMDGPUTargetMachine &TM, unsigned AddrSpace) { + MachineInstr *Def = MRI.getVRegDef(Val); + switch (Def->getOpcode()) { + case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_GLOBAL_VALUE: + case AMDGPU::G_BLOCK_ADDR: + return true; + case AMDGPU::G_CONSTANT: { + const ConstantInt *CI = Def->getOperand(1).getCImm(); + return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); + } + default: + return false; + } + + return false; +} + bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1862,6 +1896,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS); + + if (isKnownNonNull(Src, MRI, TM, SrcAS)) { + // Extract low 32-bits of the pointer. + B.buildExtract(Dst, Src, 0); + MI.eraseFromParent(); + return true; + } + unsigned NullVal = TM.getNullPointerValue(DestAS); auto SegmentNull = B.buildConstant(DstTy, NullVal); @@ -1884,24 +1926,29 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (!ST.hasFlatAddressSpace()) return false; - auto SegmentNull = - B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); - auto FlatNull = - B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); if (!ApertureReg.isValid()) return false; - auto CmpRes = - B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); - // Coerce the type of the low half of the result so we can use merge_values. Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + + if (isKnownNonNull(Src, MRI, TM, SrcAS)) { + B.buildCopy(Dst, BuildPtr); + MI.eraseFromParent(); + return true; + } + + auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + + auto CmpRes = + B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); MI.eraseFromParent(); @@ -1959,6 +2006,7 @@ bool AMDGPULegalizerInfo::legalizeFceil( // TODO: Should this propagate fast-math-flags? 
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); + MI.eraseFromParent(); return true; } @@ -2213,10 +2261,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Dst)); - if (IdxVal < VecTy.getNumElements()) - B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); - else + if (IdxVal < VecTy.getNumElements()) { + auto Unmerge = B.buildUnmerge(EltTy, Vec); + B.buildCopy(Dst, Unmerge.getReg(IdxVal)); + } else { B.buildUndef(Dst); + } MI.eraseFromParent(); return true; @@ -2245,11 +2295,20 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( LLT VecTy = MRI.getType(Vec); LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Ins)); + (void)Ins; - if (IdxVal < VecTy.getNumElements()) - B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); - else + unsigned NumElts = VecTy.getNumElements(); + if (IdxVal < NumElts) { + SmallVector<Register, 8> SrcRegs; + for (unsigned i = 0; i < NumElts; ++i) + SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); + B.buildUnmerge(SrcRegs, Vec); + + SrcRegs[IdxVal] = MI.getOperand(2).getReg(); + B.buildMerge(Dst, SrcRegs); + } else { B.buildUndef(Dst); + } MI.eraseFromParent(); return true; @@ -2502,7 +2561,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, const LLT MemTy = MMO->getMemoryType(); const Align MemAlign = MMO->getAlign(); const unsigned MemSize = MemTy.getSizeInBits(); - const unsigned AlignInBits = 8 * MemAlign.value(); + const uint64_t AlignInBits = 8 * MemAlign.value(); // Widen non-power-of-2 loads to the alignment if needed if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { @@ -2832,8 +2891,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); assert(DstReg.isVirtual() && "Virtual register expected"); - Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, - ArgTy); + Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, + *ArgRC, B.getDebugLoc(), ArgTy); if (Arg->isMasked()) { // TODO: Should we try to emit this once in the entry block? const LLT S32 = LLT::scalar(32); @@ -2842,6 +2901,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, Register AndMaskSrc = LiveIn; + // TODO: Avoid clearing the high bits if we know workitem id y/z are always + // 0. if (Shift != 0) { auto ShiftAmt = B.buildConstant(S32, Shift); AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); @@ -4106,7 +4167,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; - case Intrinsic::amdgcn_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; @@ -4213,15 +4273,18 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, if ((I < Intr->GradientStart) || (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || (I >= Intr->CoordStart && !IsA16)) { - // Handle any gradient or coordinate operands that should not be packed if ((I < Intr->GradientStart) && IsA16 && (B.getMRI()->getType(AddrReg) == S16)) { + assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); // Special handling of bias when A16 is on. 
Bias is of type half but // occupies full 32-bit. PackedAddrs.push_back( B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) .getReg(0)); } else { + assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && + "Bias needs to be converted to 16 bit in A16 mode"); + // Handle any gradient or coordinate operands that should not be packed AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); PackedAddrs.push_back(AddrReg); } @@ -4320,6 +4383,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const LLT V2S16 = LLT::fixed_vector(2, 16); unsigned DMask = 0; + Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); + LLT Ty = MRI->getType(VData); // Check for 16 bit addresses and pack if true. LLT GradTy = @@ -4328,6 +4393,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); const bool IsG16 = GradTy == S16; const bool IsA16 = AddrTy == S16; + const bool IsD16 = Ty.getScalarType() == S16; int DMaskLanes = 0; if (!BaseOpcode->Atomic) { @@ -4347,8 +4413,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( Observer.changingInstr(MI); auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); - unsigned NewOpcode = NumDefs == 0 ? - AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 + : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; + const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 + : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; // Track that we legalized this MI.setDesc(B.getTII().get(NewOpcode)); @@ -4381,44 +4450,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( unsigned CorrectedNumVAddrs = Intr->NumVAddrs; - // Optimize _L to _LZ when _L is zero - if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { - const ConstantFP *ConstantLod; - - if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, - m_GFCst(ConstantLod))) { - if (ConstantLod->isZero() || ConstantLod->isNegative()) { - // Set new opcode to _lz variant of _l, and change the intrinsic ID. - const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = - AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, - Intr->Dim); - - // The starting indexes should remain in the same place. - --CorrectedNumVAddrs; - - MI.getOperand(MI.getNumExplicitDefs()) - .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); - MI.RemoveOperand(ArgOffset + Intr->LodIndex); - Intr = NewImageDimIntr; - } - } - } - - // Optimize _mip away, when 'lod' is zero - if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { - int64_t ConstantLod; - if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, - m_ICst(ConstantLod))) { - if (ConstantLod == 0) { - // TODO: Change intrinsic opcode and remove operand instead or replacing - // it with 0, as the _L to _LZ handling is done above. - MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); - --CorrectedNumVAddrs; - } - } - } - // Rewrite the addressing register layout before doing anything else. if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { // 16 bit gradients are supported, but are tied to the A16 control @@ -4494,9 +4525,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (BaseOpcode->Store) { // No TFE for stores? 
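// Hedged aside before the store path below: with packed D16, a <4 x half>
// payload travels as two dwords, two half values per dword - the layout that
// handleD16VData rebuilds. A host-side sketch of that packing (illustrative
// only, assuming <cstdint> is available):
auto PackD16Sketch = [](const uint16_t Half[4], uint32_t Dword[2]) {
  Dword[0] = uint32_t(Half[0]) | (uint32_t(Half[1]) << 16); // h0,h1 -> dword0
  Dword[1] = uint32_t(Half[2]) | (uint32_t(Half[3]) << 16); // h2,h3 -> dword1
};
(void)PackD16Sketch;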
// TODO: Handle dmask trim - Register VData = MI.getOperand(1).getReg(); - LLT Ty = MRI->getType(VData); - if (!Ty.isVector() || Ty.getElementType() != S16) + if (!Ty.isVector() || !IsD16) return true; Register RepackedReg = handleD16VData(B, *MRI, VData, true); @@ -4508,9 +4537,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } Register DstReg = MI.getOperand(0).getReg(); - LLT Ty = MRI->getType(DstReg); const LLT EltTy = Ty.getScalarType(); - const bool IsD16 = Ty.getScalarType() == S16; const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; // Confirm that the return type is large enough for the dmask specified @@ -4918,6 +4945,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return true; } +static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { + B.buildConstant(MI.getOperand(0).getReg(), C); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -5021,12 +5054,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_implicitarg_ptr: return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0) + return replaceWithConstant(B, MI, 0); return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_X); case Intrinsic::amdgcn_workitem_id_y: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0) + return replaceWithConstant(B, MI, 0); + return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_Y); case Intrinsic::amdgcn_workitem_id_z: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0) + return replaceWithConstant(B, MI, 0); + return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: @@ -5105,16 +5146,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_struct_buffer_atomic_inc: case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_struct_buffer_atomic_dec: - case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: - case Intrinsic::amdgcn_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: return legalizeBufferAtomic(MI, B, IntrID); + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { + Register DstReg = MI.getOperand(0).getReg(); + if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) { + Function &F = B.getMF().getFunction(); + DiagnosticInfoUnsupported NoFpRet( + F, "return versions of fp atomics not supported", B.getDebugLoc(), + DS_Error); + F.getContext().diagnose(NoFpRet); + B.buildUndef(DstReg); + MI.eraseFromParent(); + return true; + } + + return legalizeBufferAtomic(MI, B, IntrID); + } case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); case Intrinsic::amdgcn_atomic_dec: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 7faf0436f995..964a41d3d740 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -21,7 
+21,6 @@ namespace llvm { class GCNTargetMachine; -class LLVMContext; class GCNSubtarget; class MachineIRBuilder; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 49cf6db5197f..c28427758ac7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -58,9 +58,6 @@ private: // "FuncName" exists. It may create a new function prototype in pre-link mode. FunctionCallee getFunction(Module *M, const FuncInfo &fInfo); - // Replace a normal function with its native version. - bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo); - bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo); bool TDOFold(CallInst *CI, const FuncInfo &FInfo); @@ -90,24 +87,6 @@ private: double& Res1, Constant *copr0, Constant *copr1, Constant *copr2); bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo); - // exp - bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // exp2 - bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // exp10 - bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // log - bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // log2 - bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // log10 - bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - // sqrt bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); @@ -623,7 +602,8 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { Function *Callee = CI->getCalledFunction(); // Ignore indirect calls. - if (Callee == 0) return false; + if (Callee == nullptr) + return false; BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); @@ -778,27 +758,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { return false; } -bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) { - Module *M = CI->getModule(); - if (getArgType(FInfo) != AMDGPULibFunc::F32 || - FInfo.getPrefix() != AMDGPULibFunc::NOPFX || - !HasNative(FInfo.getId())) - return false; - - AMDGPULibFunc nf = FInfo; - nf.setPrefix(AMDGPULibFunc::NATIVE); - if (FunctionCallee FPExpr = getFunction(M, nf)) { - LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); - - CI->setCalledFunction(FPExpr); - - LLVM_DEBUG(dbgs() << *CI << '\n'); - - return true; - } - return false; -} - // [native_]half_recip(c) ==> 1.0/c bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo) { @@ -1402,8 +1361,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, Function *UCallee = UI->getCalledFunction(); Type *RetType = UCallee->getReturnType(); B.SetInsertPoint(&*ItNew); - AllocaInst *Alloc = B.CreateAlloca(RetType, 0, - std::string(prefix) + UI->getName()); + AllocaInst *Alloc = + B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName()); Alloc->setAlignment( Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; @@ -1724,7 +1683,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) continue; + if (Callee == nullptr) + continue; LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; dbgs().flush()); @@ -1757,7 +1717,7 @@ PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, // Ignore indirect calls. 
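// (Illustrative IR, hedged: getCalledFunction() is null exactly when the
// callee is not a known Function, e.g.
//   %fp = load float (float)*, float (float)** %slot
//   %r = call float %fp(float %x)   ; indirect call - nothing to fold
// which is why such calls are skipped below.)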
Function *Callee = CI->getCalledFunction(); - if (Callee == 0) + if (Callee == nullptr) continue; LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; @@ -1783,9 +1743,10 @@ bool AMDGPUUseNativeCalls::runOnFunction(Function &F) { // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) continue; + if (Callee == nullptr) + continue; - if(Simplifier.useNative(CI)) + if (Simplifier.useNative(CI)) Changed = true; } } @@ -1811,7 +1772,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) + if (Callee == nullptr) continue; if (Simplifier.useNative(CI)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h index c97223b047e8..dc0ac72016f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -10,6 +10,7 @@ #define _AMDGPU_LIBFUNC_H_ #include "llvm/ADT/StringRef.h" +#include <memory> namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0c743a77092c..593388a4d819 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -15,9 +15,8 @@ using namespace llvm; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) - : MachineFunctionInfo(), Mode(MF.getFunction()), - IsEntryFunction( - AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), + : Mode(MF.getFunction()), IsEntryFunction(AMDGPU::isEntryFunctionCC( + MF.getFunction().getCallingConv())), IsModuleEntryFunction( AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 10ff50040c6a..48cf46b5f871 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -15,8 +15,6 @@ namespace llvm { -class GCNSubtarget; - class AMDGPUMachineFunction : public MachineFunctionInfo { /// A map to keep track of local memory objects and their offsets within the /// local memory space. 
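// Hedged aside on the map described above (helper name and types are
// hypothetical, not from this patch): offsets are typically handed out by
// packing each object at the next aligned position of a running LDS budget.
#include <cstdint>
#include <map>
static uint32_t allocateLDSOffset(std::map<const void *, uint32_t> &Offsets,
                                  const void *Object, uint32_t &LDSSize,
                                  uint32_t Alignment, // power of two assumed
                                  uint32_t Size) {
  LDSSize = (LDSSize + Alignment - 1) & ~(Alignment - 1); // align the slot
  Offsets[Object] = LDSSize; // record where this object lives
  LDSSize += Size;           // grow the running budget
  return Offsets[Object];
}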
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h index 8af7979dba8b..5cefc83e25e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -29,4 +29,4 @@ const char NoteNameV3[] = "AMDGPU"; } // End namespace ElfNote } // End namespace AMDGPU } // End namespace llvm -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 7c4eb71882c7..f91f31508ad2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -463,7 +463,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { WhatToStore.push_back(Arg); } } else if (isa<FixedVectorType>(ArgType)) { - Type *IType = NULL; + Type *IType = nullptr; uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements(); uint32_t EleSize = ArgType->getScalarSizeInBits(); uint32_t TotalSize = EleCount * EleSize; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index f9a9fe403ff6..2d8126a49327 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -789,6 +789,17 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { Align Alignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType()); + + // HIP uses an extern unsized array in local address space for dynamically + // allocated shared memory. In that case, we have to disable the promotion. + if (GV->hasExternalLinkage() && AllocSize == 0) { + LocalMemLimit = 0; + LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated " + "local memory. Promoting to local memory " + "disabled.\n"); + return false; + } + AllocatedSizes.emplace_back(AllocSize, Alignment); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 3ce67a733c10..0df6f4d45b06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -36,6 +36,7 @@ protected: MachineIRBuilder &B; MachineFunction &MF; MachineRegisterInfo &MRI; + const GCNSubtarget &Subtarget; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; const SIInstrInfo &TII; @@ -44,9 +45,9 @@ protected: public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), - RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), - TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){}; + Subtarget(MF.getSubtarget<GCNSubtarget>()), + RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()), + TII(*Subtarget.getInstrInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -193,7 +194,10 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( MachineInstr &MI, Med3MatchInfo &MatchInfo) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); - if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + + // med3 for f16 is only available on gfx9+, and not available for v2f16. 
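// For orientation, a scalar sketch of the value a med3 produces - the median
// of its three operands (illustrative only; the <algorithm> include is
// assumed, NaN handling is ignored, and the real combine matches
// G_FMINNUM/G_FMAXNUM trees rather than plain floats):
auto Med3Sketch = [](float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
};
(void)Med3Sketch;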
+ if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && + Ty != LLT::scalar(32)) return false; auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c60012bcfe2e..de2dccef804a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -718,8 +718,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); const unsigned WaveAndOpc = Subtarget.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - const unsigned MovTermOpc = Subtarget.isWave32() ? - AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned MovExecOpc = + Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const unsigned MovExecTermOpc = + Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned XorTermOpc = Subtarget.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; const unsigned AndSaveExecOpc = Subtarget.isWave32() ? @@ -996,12 +999,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) + BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) .addReg(ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(MovTermOpc) + B.buildInstr(MovExecTermOpc) .addDef(ExecReg) .addReg(SaveExecReg); @@ -2953,7 +2956,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; } case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: - case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); assert(RSrcIntrin && RSrcIntrin->IsImage); @@ -3691,6 +3696,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); break; } + case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { + // This case is weird because we expect a physical register in the source, + // but need to set a bank anyway. + // + // We could select the result to SGPR or VGPR, but for the one current use + // it's more practical to always use VGPR. 
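// (A tentative reading of the mapping below: the result is put on the VGPR
// bank while the physical-register source is modeled as SGPR, so selection
// will insert a copy to materialize the value in a VGPR.)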
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; + } case AMDGPU::G_INSERT: { unsigned BankID = getMappingType(MRI, MI); unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); @@ -4078,7 +4093,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mqsad_pk_u16_u8: case Intrinsic::amdgcn_mqsad_u32_u8: case Intrinsic::amdgcn_cvt_pk_u8_f32: - case Intrinsic::amdgcn_alignbit: case Intrinsic::amdgcn_alignbyte: case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_fdot2: @@ -4276,7 +4290,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: - case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { auto IntrID = MI.getIntrinsicID(); const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 45f7c2f369bd..1c6c63dd5b25 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -353,7 +353,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { // off any return attributes, e.g. zeroext doesn't make sense with a struct. NewFunc->stealArgumentListFrom(F); - AttrBuilder RetAttrs; + AttributeMask RetAttrs; RetAttrs.addAttribute(Attribute::SExt); RetAttrs.addAttribute(Attribute::ZExt); RetAttrs.addAttribute(Attribute::NoAlias); @@ -433,7 +433,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { PointerType *ArgType = cast<PointerType>(Arg.getType()); - auto *EltTy = ArgType->getElementType(); + auto *EltTy = ArgType->getPointerElementType(); const auto Align = DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index cd05797fdbdb..e82f9232b114 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -269,7 +269,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasGetWaveIdInst(false), HasSMemTimeInst(false), HasShaderCyclesRegister(false), - HasRegisterBanking(false), HasVOP3Literal(false), HasNoDataDepHazard(false), FlatAddressSpace(false), @@ -772,11 +771,11 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { } unsigned -GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const { +GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. - if (HasFlatScratchInit || HasArchitectedFlatScratch) { + if (HasFlatScratch || HasArchitectedFlatScratch) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) @@ -794,20 +793,11 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { } unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { - // The logic to detect if the function has - // flat scratch init is slightly different than how - // SIMachineFunctionInfo constructor derives. - // We don't use amdgpu-calls, amdgpu-stack-objects - // attributes and isAmdHsaOrMesa here as it doesn't really matter. - // TODO: Outline this derivation logic and have just - // one common function in the backend to avoid duplication. - bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv()); - bool FunctionHasFlatScratchInit = false; - if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() && - enableFlatScratch()) { - FunctionHasFlatScratchInit = true; - } - return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit); + // In principle we do not need to reserve SGPR pair used for flat_scratch if + // we know flat instructions do not access the stack anywhere in the + // program. For now assume it's needed if we have flat instructions. + const bool KernelUsesFlatScratch = hasFlatAddressSpace(); + return getBaseReservedNumSGPRs(KernelUsesFlatScratch); } unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 88ed4b2b7a24..7f1b94be4ffe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -212,7 +212,19 @@ public: /// Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const Function &F) const { - return isAmdHsaOrMesa(F) ? 0 : 36; + switch (TargetTriple.getOS()) { + case Triple::AMDHSA: + case Triple::AMDPAL: + case Triple::Mesa3D: + return 0; + case Triple::UnknownOS: + default: + // For legacy reasons unknown/other is treated as a different version of + // mesa. 
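// (Context note, hedged: 36 bytes equals nine 32-bit implicit dispatch
// values - e.g. grid and workgroup sizes - placed ahead of the first
// explicit argument by the legacy loader; the exact field list is an
// assumption here, only the 9-dword size follows from the constant.)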
+ return 36; + } + + llvm_unreachable("invalid triple OS"); } /// \returns Maximum number of work groups per compute unit supported by the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 226646a96953..dd3676f3b707 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -21,8 +21,6 @@ namespace llvm { -class ScheduleDAGMILive; - //===----------------------------------------------------------------------===// // AMDGPU Target Machine (R600+) //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 09c5eb192e1f..a8df7789c8a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -844,15 +844,8 @@ bool GCNTTIImpl::isInlineAsmSourceOfDivergence( TLI->ComputeConstraintToUse(TC, SDValue()); - Register AssignedReg; - const TargetRegisterClass *RC; - std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint( - TRI, TC.ConstraintCode, TC.ConstraintVT); - if (AssignedReg) { - // FIXME: This is a workaround for getRegForInlineAsmConstraint - // returning VS_32 - RC = TRI->getPhysRegClass(AssignedReg); - } + const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint( + TRI, TC.ConstraintCode, TC.ConstraintVT).second; // For AGPR constraints null is returned on subtargets without AGPRs, so // assume divergent for null. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2bb59086f391..c1c88d9a7462 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -62,7 +62,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { public: AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_) - : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {} + : Kind(Kind_), AsmParser(AsmParser_) {} using Ptr = std::unique_ptr<AMDGPUOperand>; @@ -1548,6 +1548,7 @@ private: bool validateVccOperand(unsigned Reg) const; bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands); bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands); + bool validateMFMA(const MCInst &Inst, const OperandVector &Operands); bool validateAGPRLdSt(const MCInst &Inst) const; bool validateVGPRAlign(const MCInst &Inst) const; bool validateGWS(const MCInst &Inst, const OperandVector &Operands); @@ -3613,6 +3614,40 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, + const OperandVector &Operands) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::IsMAI) == 0) + return true; + + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx == -1) + return true; + + const MCOperand &Src2 = Inst.getOperand(Src2Idx); + if (!Src2.isReg()) + return true; + + MCRegister Src2Reg = Src2.getReg(); + MCRegister DstReg = Inst.getOperand(0).getReg(); + if (Src2Reg == DstReg) + return true; + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128) + return true; + + if (isRegIntersect(Src2Reg, DstReg, TRI)) { + Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands), + "source 2 
operand must not partially overlap with dst"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateDivScale(const MCInst &Inst) { switch (Inst.getOpcode()) { default: @@ -4297,6 +4332,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateMAIAccWrite(Inst, Operands)) { return false; } + if (!validateMFMA(Inst, Operands)) { + return false; + } if (!validateCoherencyBits(Inst, Operands, IDLoc)) { return false; } @@ -4568,7 +4606,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { uint64_t AccumOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; - unsigned UserSGPRCount = 0; + + // Count the number of user SGPRs implied from the enabled feature bits. + unsigned ImpliedUserSGPRCount = 0; + + // Track if the asm explicitly contains the directive for the user SGPR + // count. + Optional<unsigned> ExplicitUserSGPRCount; bool ReserveVCC = true; bool ReserveFlatScr = true; Optional<bool> EnableWavefrontSize32; @@ -4617,6 +4661,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val)) return OutOfRangeError(ValRange); KD.kernarg_size = Val; + } else if (ID == ".amdhsa_user_sgpr_count") { + ExplicitUserSGPRCount = Val; } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { if (hasArchitectedFlatScratch()) return Error(IDRange.Start, @@ -4626,31 +4672,31 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); if (Val) - UserSGPRCount += 4; + ImpliedUserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { if (hasArchitectedFlatScratch()) return Error(IDRange.Start, @@ -4660,13 +4706,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); if (Val) - UserSGPRCount += 1; + ImpliedUserSGPRCount += 1; } else if (ID == ".amdhsa_wavefront_size32") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); @@ -4850,6 +4896,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, SGPRBlocks); + if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount) + return TokError(".amdhsa_user_sgpr_count smaller than implied by " + "enabled user SGPRs"); + + unsigned UserSGPRCount = + ExplicitUserSGPRCount ?
*ExplicitUserSGPRCount : ImpliedUserSGPRCount; + if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) return TokError("too many user SGPRs enabled"); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 104b5160b985..c4043177b618 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -89,7 +89,6 @@ class DS_Real <DS_Pseudo ps> : !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0)); } - // DS Pseudo instructions class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c7ec5308e6d0..c530d3cb49f0 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -915,7 +915,7 @@ class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueT class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (inst $vaddr, $data, $offset) + (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0f8dd0b3bf58..c0592f6f3c7a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -95,7 +95,9 @@ static bool isDGEMM(unsigned Opcode) { return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64; + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64; } static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { @@ -1438,7 +1440,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { if (!Use.isReg()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); bool FullReg; const MachineInstr *MI1; @@ -1477,6 +1479,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: if (!isXDL(ST, *MI)) NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; break; @@ -1509,6 +1513,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; break; case AMDGPU::V_MFMA_F64_4X4X4F64_e64: diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 162121c2c525..716bc027a894 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -25,7 +25,6 @@ class MachineFunction; class MachineInstr; class MachineOperand; class MachineRegisterInfo; -class ScheduleDAG; class SIInstrInfo; class 
SIRegisterInfo; class GCNSubtarget; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 82c09378acac..fb106d98c162 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI, << *LIS.getInstructionFromIndex(SI); unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - const unsigned Reg = Register::index2VirtReg(I); + const Register Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); @@ -487,7 +487,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, const MachineRegisterInfo &MRI) { const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 53d6ff0aa731..a6e42ad3dfca 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -140,4 +140,4 @@ public: } // End namespace llvm -#endif // GCNSCHEDSTRATEGY_H +#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index d8bc0b2df2bd..0cd2cfa2f0e7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -153,7 +153,6 @@ protected: bool HasGetWaveIdInst; bool HasSMemTimeInst; bool HasShaderCyclesRegister; - bool HasRegisterBanking; bool HasVOP3Literal; bool HasNoDataDepHazard; bool FlatAddressSpace; @@ -723,10 +722,6 @@ public: return HasShaderCyclesRegister; } - bool hasRegisterBanking() const { - return HasRegisterBanking; - } - bool hasVOP3Literal() const { return HasVOP3Literal; } @@ -1029,7 +1024,7 @@ public: /// \returns Reserved number of SGPRs. This is common /// utility function called by MachineFunction and /// Function variants of getReservedNumSGPRs. - unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const; + unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; /// \returns Reserved number of SGPRs for given machine function \p MF. 
unsigned getReservedNumSGPRs(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index b68b4b12e750..76663b563150 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1397,21 +1397,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, unsigned Vmcnt, Expcnt, Lgkmcnt; decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt); + bool IsDefaultVmcnt = Vmcnt == getVmcntBitMask(ISA); + bool IsDefaultExpcnt = Expcnt == getExpcntBitMask(ISA); + bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA); + bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt; + bool NeedSpace = false; - if (Vmcnt != getVmcntBitMask(ISA)) { + if (!IsDefaultVmcnt || PrintAll) { O << "vmcnt(" << Vmcnt << ')'; NeedSpace = true; } - if (Expcnt != getExpcntBitMask(ISA)) { + if (!IsDefaultExpcnt || PrintAll) { if (NeedSpace) O << ' '; O << "expcnt(" << Expcnt << ')'; NeedSpace = true; } - if (Lgkmcnt != getLgkmcntBitMask(ISA)) { + if (!IsDefaultLgkmcnt || PrintAll) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 7708579a4491..ded3fb7ab8d9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -15,8 +15,7 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, - const MCTargetOptions &Options) - : MCAsmInfoELF() { + const MCTargetOptions &Options) { CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4; StackGrowsUp = true; HasSingleParameterDotFile = false; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 9a9a2c973f44..9578bdb0bad0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -319,6 +319,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( << KD.private_segment_fixed_size << '\n'; OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n'; + PRINT_FIELD(OS, ".amdhsa_user_sgpr_count", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT); + if (!hasArchitectedFlatScratch(STI)) PRINT_FIELD( OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 6dd886367302..cf03fd682143 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -131,6 +131,38 @@ def MIMGMIPMappingTable : GenericTable { let PrimaryKeyName = "getMIMGMIPMappingInfo"; } +class MIMGBiasMapping<MIMGBaseOpcode bias, MIMGBaseOpcode nobias> { + MIMGBaseOpcode Bias = bias; + MIMGBaseOpcode NoBias = nobias; +} + +def MIMGBiasMappingTable : GenericTable { + let FilterClass = "MIMGBiasMapping"; + let CppTypeName = "MIMGBiasMappingInfo"; + let Fields = ["Bias", "NoBias"]; + string TypeOf_Bias = "MIMGBaseOpcode"; + string TypeOf_NoBias = "MIMGBaseOpcode"; + + let PrimaryKey = ["Bias"]; + let PrimaryKeyName = "getMIMGBiasMappingInfo"; +} + +class MIMGOffsetMapping<MIMGBaseOpcode offset, MIMGBaseOpcode nooffset> { + MIMGBaseOpcode Offset = offset; + MIMGBaseOpcode NoOffset = nooffset; +} + +def MIMGOffsetMappingTable : GenericTable 
{ + let FilterClass = "MIMGOffsetMapping"; + let CppTypeName = "MIMGOffsetMappingInfo"; + let Fields = ["Offset", "NoOffset"]; + string TypeOf_Offset = "MIMGBaseOpcode"; + string TypeOf_NoOffset = "MIMGBaseOpcode"; + + let PrimaryKey = ["Offset"]; + let PrimaryKeyName = "getMIMGOffsetMappingInfo"; +} + class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> { MIMGBaseOpcode G = g; MIMGBaseOpcode G16 = g16; @@ -1070,6 +1102,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; + bits<8> NumOffsetArgs = DimEval.NumOffsetArgs; + bits<8> NumBiasArgs = DimEval.NumBiasArgs; + bits<8> NumZCompareArgs = DimEval.NumZCompareArgs; bits<8> NumGradients = DimEval.NumGradientArgs; bits<8> NumDmask = DimEval.NumDmaskArgs; bits<8> NumData = DimEval.NumDataArgs; @@ -1078,6 +1113,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> DMaskIndex = DimEval.DmaskArgIndex; bits<8> VAddrStart = DimEval.VAddrArgIndex; + bits<8> OffsetIndex = DimEval.OffsetArgIndex; + bits<8> BiasIndex = DimEval.BiasArgIndex; + bits<8> ZCompareIndex = DimEval.ZCompareArgIndex; bits<8> GradientStart = DimEval.GradientArgIndex; bits<8> CoordStart = DimEval.CoordArgIndex; bits<8> LodIndex = DimEval.LodArgIndex; @@ -1089,6 +1127,8 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex; bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex; + bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes, + !if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny)); bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes, !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny))); bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); @@ -1096,10 +1136,10 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", + let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", + "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", - "GradientTyArg", "CoordTyArg"]; + "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; @@ -1132,6 +1172,66 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>; def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>; def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>; +// Bias to NoBias Optimization Mapping +def : MIMGBiasMapping<IMAGE_SAMPLE_B, IMAGE_SAMPLE>; +def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL, IMAGE_SAMPLE_CL>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B, IMAGE_SAMPLE_C>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL, IMAGE_SAMPLE_C_CL>; +def : MIMGBiasMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_O>; +def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_CL_O>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_O>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_CL_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_B, IMAGE_GATHER4>; +def : MIMGBiasMapping<IMAGE_GATHER4_B_CL, 
IMAGE_GATHER4_CL>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B, IMAGE_GATHER4_C>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL, IMAGE_GATHER4_C_CL>; +def : MIMGBiasMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_CL_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_CL_O>; + +// Offset to NoOffset Optimization Mapping +def : MIMGOffsetMapping<IMAGE_SAMPLE_O, IMAGE_SAMPLE>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CL_O, IMAGE_SAMPLE_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O_G16, IMAGE_SAMPLE_D_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O_G16, IMAGE_SAMPLE_D_CL_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_L>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_B>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_B_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_LZ_O, IMAGE_SAMPLE_LZ>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_O, IMAGE_SAMPLE_C>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CL_O, IMAGE_SAMPLE_C_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O_G16, IMAGE_SAMPLE_C_D_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O_G16, IMAGE_SAMPLE_C_D_CL_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_L>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_B_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_B>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_LZ_O, IMAGE_SAMPLE_C_LZ>; +def : MIMGOffsetMapping<IMAGE_GATHER4_O, IMAGE_GATHER4>; +def : MIMGOffsetMapping<IMAGE_GATHER4_CL_O, IMAGE_GATHER4_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_L>; +def : MIMGOffsetMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_B>; +def : MIMGOffsetMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_B_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_LZ_O, IMAGE_GATHER4_LZ>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_O, IMAGE_GATHER4_C>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_CL_O, IMAGE_GATHER4_C_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_L>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_B>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_B_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_LZ_O, IMAGE_GATHER4_C_LZ>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_G16, IMAGE_SAMPLE_CD_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_G16, IMAGE_SAMPLE_CD_CL_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_G16, IMAGE_SAMPLE_C_CD_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_G16, IMAGE_SAMPLE_C_CD_CL_G16>; + // G to G16 Optimization Mapping def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>; def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index f9a9a6127322..1e75a0432ec3 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -19,7 +19,6 @@ namespace llvm { -class 
R600InstrInfo; class R600Subtarget; class R600TargetLowering final : public AMDGPUTargetLowering { diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index fc567f1a1fca..bc8a4786df77 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -29,7 +29,6 @@ enum : uint64_t { }; } -class AMDGPUTargetMachine; class DFAPacketizer; class MachineFunction; class MachineInstr; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 94403b88f21a..92d559b1f8e6 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -21,12 +21,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -namespace llvm { - -class MCInstrInfo; - -} // namespace llvm - #define GET_SUBTARGETINFO_HEADER #include "R600GenSubtargetInfo.inc" diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 397b2f873515..b81fac36fc95 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -245,6 +245,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition( return CallInst::Create(IfBreak, Args, "", Insert); } + if (isa<Argument>(Cond)) { + Instruction *Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); + } + llvm_unreachable("Unhandled loop condition!"); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 580e4bc417a4..107ee5ed5532 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -379,6 +379,8 @@ enum Id { // HwRegCode, (6) [5:0] ID_FLAT_SCR_LO = 20, ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, + ID_HW_ID1 = 23, + ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_SHADER_CYCLES = 29, ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1f93284fc7ee..33954e11d6c6 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -300,6 +300,13 @@ static bool updateOperand(FoldCandidate &Fold, assert(!Fold.needsShrink() && "not handled"); if (Fold.isImm()) { + if (Old.isTied()) { + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode()); + if (NewMFMAOpc == -1) + return false; + MI->setDesc(TII.get(NewMFMAOpc)); + MI->untieRegOperand(0); + } Old.ChangeToImmediate(Fold.ImmToFold); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index d4fe74ecb96e..6078f4a0577a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1195,7 +1195,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } else if (TII->isStoreToStackSlot(MI, FrameIndex) || TII->isLoadFromStackSlot(MI, FrameIndex)) - NonVGPRSpillFIs.set(FrameIndex); + if (!MFI.isFixedObjectIndex(FrameIndex)) + NonVGPRSpillFIs.set(FrameIndex); } } @@ -1320,16 +1321,14 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, const BitVector AllSavedRegs = SavedRegs; SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); - // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 
- const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; - // We have to anticipate introducing CSR VGPR spills or spill of caller // save VGPR reserved for SGPR spills as we now always create stack entry - // for it, if we don't have any stack objects already, since we require - // an FP if there is a call and stack. + // for it, if we don't have any stack objects already, since we require a FP + // if there is a call and stack. We will allocate a VGPR for SGPR spills if + // there are any SGPR spills. Whether they are CSR spills or otherwise. MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const bool WillHaveFP = - FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill); + FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs()); // FP will be specially managed like SP. if (WillHaveFP || hasFP(MF)) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 56fbb875ffd9..7949dcfa6632 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -13,11 +13,6 @@ namespace llvm { -class SIInstrInfo; -class SIMachineFunctionInfo; -class SIRegisterInfo; -class GCNSubtarget; - class SIFrameLowering final : public AMDGPUFrameLowering { public: SIFrameLowering(StackDirection D, Align StackAl, int LAO, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9f138136e6e9..561866b5a398 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -45,10 +45,6 @@ static cl::opt<bool> DisableLoopAlignment( cl::desc("Do not align and prefetch loops"), cl::init(false)); -static cl::opt<bool> VGPRReserveforSGPRSpill( - "amdgpu-reserve-vgpr-for-sgpr-spill", - cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true)); - static cl::opt<bool> UseDivergentRegisterIndexing( "amdgpu-use-divergent-register-indexing", cl::Hidden, @@ -138,6 +134,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -273,7 +271,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, - MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { + MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64, + MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -615,7 +614,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, + MVT::v8f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -677,6 +677,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v4f16, Promote); AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v8i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32); + 
setOperationAction(ISD::LOAD, MVT::v8f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v8i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v8f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); @@ -686,6 +701,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand); + if (!Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -703,9 +722,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); + setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand); + + for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand); + } } if (Subtarget->hasVOP3PInsts()) { @@ -739,34 +769,42 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); - setOperationAction(ISD::SHL, MVT::v4i16, Custom); - setOperationAction(ISD::SRA, MVT::v4i16, Custom); - setOperationAction(ISD::SRL, MVT::v4i16, Custom); - setOperationAction(ISD::ADD, MVT::v4i16, Custom); - setOperationAction(ISD::SUB, MVT::v4i16, Custom); - setOperationAction(ISD::MUL, MVT::v4i16, Custom); + for (MVT VT : { MVT::v4i16, MVT::v8i16 }) { + // Split vector operations. 
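// ("Split" mirrors splitBinaryVectorOp later in this file: a v8i16 ADD, for
// example, is rebuilt as two v4i16 ADDs over the halves produced by
// DAG.SplitVectorOperand and rejoined with ISD::CONCAT_VECTORS. Sketch of
// the flow, not a new code path:
//   (Lo0,Hi0) = split(Op0); (Lo1,Hi1) = split(Op1)
//   Result = concat(ADD(Lo0,Lo1), ADD(Hi0,Hi1)) )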
+ setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); - setOperationAction(ISD::SMIN, MVT::v4i16, Custom); - setOperationAction(ISD::SMAX, MVT::v4i16, Custom); - setOperationAction(ISD::UMIN, MVT::v4i16, Custom); - setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); - setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom); - setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom); - setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom); - setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + } - setOperationAction(ISD::FADD, MVT::v4f16, Custom); - setOperationAction(ISD::FMUL, MVT::v4f16, Custom); - setOperationAction(ISD::FMA, MVT::v4f16, Custom); + for (MVT VT : { MVT::v4f16, MVT::v8f16 }) { + // Split vector operations. + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Custom); + } setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); @@ -803,7 +841,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, MVT::v2f16, Custom); } - for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { + for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v8i16, MVT::v8f16 }) { setOperationAction(ISD::SELECT, VT, Custom); } @@ -2776,6 +2815,7 @@ void SITargetLowering::passSpecialInputs( SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; + const Function &F = DAG.getMachineFunction().getFunction(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); @@ -2887,11 +2927,16 @@ void SITargetLowering::passSpecialInputs( // If incoming ids are not packed we need to pack them. 
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && - NeedWorkItemIDX) - InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); + NeedWorkItemIDX) { + if (Subtarget->getMaxWorkitemID(F, 0) != 0) { + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); + } else { + InputReg = DAG.getConstant(0, DL, MVT::i32); + } + } if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && - NeedWorkItemIDY) { + NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) { SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, DAG.getShiftAmountConstant(10, MVT::i32, SL)); @@ -2900,7 +2945,7 @@ void SITargetLowering::passSpecialInputs( } if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && - NeedWorkItemIDZ) { + NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) { SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, DAG.getShiftAmountConstant(20, MVT::i32, SL)); @@ -2909,13 +2954,21 @@ void SITargetLowering::passSpecialInputs( } if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { - // Workitem ids are already packed, any of present incoming arguments - // will carry all required fields. - ArgDescriptor IncomingArg = ArgDescriptor::createArg( - IncomingArgX ? *IncomingArgX : - IncomingArgY ? *IncomingArgY : - *IncomingArgZ, ~0u); - InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { + // We're in a situation where the outgoing function requires the workitem + // ID, but the calling function does not have it (e.g. a graphics function + // calling a C calling convention function). This is illegal, but we need + // to produce something. + InputReg = DAG.getUNDEF(MVT::i32); + } else { + // Workitem ids are already packed, any of the present incoming arguments + // will carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? *IncomingArgX : + IncomingArgY ? *IncomingArgY : + *IncomingArgZ, ~0u); + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + } } if (OutgoingArg->isRegister()) { @@ -4600,7 +4653,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 || + VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4621,21 +4675,26 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || + VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 || + VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; - std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Op0 = Op.getOperand(0); + std::tie(Lo0, Hi0) = Op0.getValueType().isVector() + ? 
DAG.SplitVectorOperand(Op.getNode(), 0) + : std::make_pair(Op0, Op0); SDValue Lo1, Hi1; std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); SDValue Lo2, Hi2; std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); SDLoc SL(Op); + auto ResVT = DAG.GetSplitDestVTs(VT); - SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2, + SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags()); - SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2, + SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags()); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); @@ -5297,7 +5356,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5501,6 +5560,22 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, MachineMemOperand::MOInvariant); } +/// Return true if the value is a known valid address, such that a null check is +/// not necessary. +static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, + const AMDGPUTargetMachine &TM, unsigned AddrSpace) { + if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) || + isa<BasicBlockSDNode>(Val)) + return true; + + if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) + return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); + + // TODO: Search through arithmetic, handle arguments and loads + // marked nonnull. + return false; +} + SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -5508,48 +5583,64 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SDValue Src = ASC->getOperand(0); SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + unsigned SrcAS = ASC->getSrcAddressSpace(); const AMDGPUTargetMachine &TM = static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); // flat -> local/private - if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { unsigned DestAS = ASC->getDestAddressSpace(); if (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + + if (isKnownNonNull(Src, DAG, TM, SrcAS)) + return Ptr; + unsigned NullVal = TM.getNullPointerValue(DestAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); - SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); - return DAG.getNode(ISD::SELECT, SL, MVT::i32, - NonNull, Ptr, SegmentNullPtr); + return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr, + SegmentNullPtr); } } // local/private -> flat if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { - unsigned SrcAS = ASC->getSrcAddressSpace(); - if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { + + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); + SDValue CvtPtr = + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); + + if (isKnownNonNull(Src, DAG, TM, SrcAS)) + return CvtPtr; + unsigned NullVal = TM.getNullPointerValue(SrcAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); - SDValue Aperture = 
getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); - SDValue CvtPtr - = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); - - return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, - DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), + return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr, FlatNullPtr); } } + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + Op.getValueType() == MVT::i64) { + const SIMachineFunctionInfo *Info = + DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT && Src.getValueType() == MVT::i64) return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); @@ -5676,7 +5767,6 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, EVT VecVT = Vec.getValueType(); unsigned VecSize = VecVT.getSizeInBits(); EVT EltVT = VecVT.getVectorElementType(); - assert(VecSize <= 64); DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); @@ -5687,6 +5777,28 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; + if (VecSize == 128) { + SDValue Lo, Hi; + EVT LoVT, HiVT; + SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); + Lo = + DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, + V2, DAG.getConstant(0, SL, MVT::i32))); + Hi = + DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, + V2, DAG.getConstant(1, SL, MVT::i32))); + EVT IdxVT = Idx.getValueType(); + unsigned NElem = VecVT.getVectorNumElements(); + assert(isPowerOf2_32(NElem)); + SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT); + SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask); + SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx); + } + + assert(VecSize <= 64); + unsigned EltSize = EltVT.getSizeInBits(); assert(isPowerOf2_32(EltSize)); @@ -5769,20 +5881,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SDLoc SL(Op); EVT VT = Op.getValueType(); - if (VT == MVT::v4i16 || VT == MVT::v4f16) { - EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8i16 || VT == MVT::v8f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 2); + MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits()); // Turn into pair of packed build_vectors. // TODO: Special case for constants that can be materialized with s_mov_b64. 
- SDValue Lo = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(0), Op.getOperand(1) }); - SDValue Hi = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(2), Op.getOperand(3) }); + SmallVector<SDValue, 4> LoOps, HiOps; + for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) { + LoOps.push_back(Op.getOperand(I)); + HiOps.push_back(Op.getOperand(I + E)); + } + SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps); - SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); - SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi); - SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL, + { CastLo, CastHi }); return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } @@ -6155,10 +6274,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); - const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = - AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); @@ -6246,28 +6361,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; SmallVector<SDValue, 4> VAddrs; - // Optimize _L to _LZ when _L is zero - if (LZMappingInfo) { - if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>( - Op.getOperand(ArgOffset + Intr->LodIndex))) { - if (ConstantLod->isZero() || ConstantLod->isNegative()) { - IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l - VAddrEnd--; // remove 'lod' - } - } - } - - // Optimize _mip away, when 'lod' is zero - if (MIPMappingInfo) { - if (auto *ConstantLod = dyn_cast<ConstantSDNode>( - Op.getOperand(ArgOffset + Intr->MipIndex))) { - if (ConstantLod->isZero()) { - IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip - VAddrEnd--; // remove 'mip' - } - } - } - // Check for 16 bit addresses or derivatives and pack if true. MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); @@ -6283,12 +6376,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // Push back extra arguments. for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { + assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); // Special handling of bias when A16 is on. Bias is of type half but // occupies a full 32-bit slot. 
- SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); - VAddrs.push_back(bias); - } else + SDValue Bias = DAG.getBuildVector( + MVT::v2f16, DL, + {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); + VAddrs.push_back(Bias); + } else { + assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && + "Bias needs to be converted to 16 bit in A16 mode"); VAddrs.push_back(Op.getOperand(ArgOffset + I)); + } } if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { @@ -6731,14 +6830,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); @@ -6899,9 +7007,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(1, SL, MVT::i32)); return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); } - case Intrinsic::amdgcn_alignbit: - return DAG.getNode(ISD::FSHR, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_perm: return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -8408,21 +8513,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + if (VT.getSizeInBits() == 128) + return splitTernaryVectorOp(Op, DAG); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); - if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() && - !Op->isDivergent()) { - if (VT == MVT::i64) - return Op; - SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1)); - SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2)); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS)); - } - SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -9550,6 +9648,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue SITargetLowering::performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) + return RV; + EVT VT = N->getValueType(0); if (VT != MVT::i64) return SDValue(); @@ -10462,6 +10563,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N, if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) + return SDValue(); + unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -10483,12 +10587,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N, if (Op1->isDivergent()) std::swap(Op1, Op2); - // If either operand is 
constant this will conflict with - // DAGCombiner::ReassociateOps(). - if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || - DAG.isConstantIntBuildVectorOrConstantInt(Op1)) - return SDValue(); - SDLoc SL(N); SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); return DAG.getNode(Opc, SL, VT, Add1, Op2); @@ -11130,7 +11228,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || - Node->getConstantOperandVal(LWEIdx)) ? 1 : 0; + Node->getConstantOperandVal(LWEIdx)) + ? true + : false; unsigned TFCLane = 0; bool HasChain = Node->getNumValues() > 1; @@ -11719,25 +11819,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, return std::make_pair(0U, RC); } - if (Constraint.size() > 1) { - if (Constraint[1] == 'v') { + if (Constraint.startswith("{") && Constraint.endswith("}")) { + StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); + if (RegName.consume_front("v")) { RC = &AMDGPU::VGPR_32RegClass; - } else if (Constraint[1] == 's') { + } else if (RegName.consume_front("s")) { RC = &AMDGPU::SGPR_32RegClass; - } else if (Constraint[1] == 'a') { + } else if (RegName.consume_front("a")) { RC = &AMDGPU::AGPR_32RegClass; } if (RC) { uint32_t Idx; - bool Failed = Constraint.substr(2).getAsInteger(10, Idx); - if (!Failed && Idx < RC->getNumRegs()) - return std::make_pair(RC->getRegister(Idx), RC); + if (RegName.consume_front("[")) { + uint32_t End; + bool Failed = RegName.consumeInteger(10, Idx); + Failed |= !RegName.consume_front(":"); + Failed |= RegName.consumeInteger(10, End); + Failed |= !RegName.consume_back("]"); + if (!Failed) { + uint32_t Width = (End - Idx + 1) * 32; + MCRegister Reg = RC->getRegister(Idx); + if (SIRegisterInfo::isVGPRClass(RC)) + RC = TRI->getVGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isSGPRClass(RC)) + RC = TRI->getSGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isAGPRClass(RC)) + RC = TRI->getAGPRClassForBitWidth(Width); + if (RC) { + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); + return std::make_pair(Reg, RC); + } + } + } else { + bool Failed = RegName.getAsInteger(10, Idx); + if (!Failed && Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } } } - // FIXME: Returns VS_32 for physical SGPR constraints - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + if (Ret.first) + Ret.second = TRI->getPhysRegClass(Ret.first); + + return Ret; } static bool isImmConstraint(StringRef Constraint) { @@ -11975,13 +12101,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } TargetLoweringBase::finalizeLowering(MF); - - // Allocate a VGPR for future SGPR Spill if - // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used - // FIXME: We won't need this hack if we split SGPR allocation from VGPR - if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() && - !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction()) - Info->reserveVGPRforSGPRSpills(MF); } void SITargetLowering::computeKnownBitsForFrameIndex( @@ -12441,17 +12560,10 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, for (auto &TC : TargetConstraints) { if (TC.Type == InlineAsm::isOutput) { ComputeConstraintToUse(TC, SDValue()); - unsigned 
AssignedReg; - const TargetRegisterClass *RC; - std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint( - SIRI, TC.ConstraintCode, TC.ConstraintVT); - if (RC) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg)) - return true; - else if (SIRI->isSGPRClass(RC)) - return true; - } + const TargetRegisterClass *RC = getRegForInlineAsmConstraint( + SIRI, TC.ConstraintCode, TC.ConstraintVT).second; + if (RC && SIRI->isSGPRClass(RC)) + return true; } } } @@ -12475,3 +12587,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, Cost.first += (Size + 255) / 256; return Cost; } + +bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { + SDNode::use_iterator I = N->use_begin(), E = N->use_end(); + for (; I != E; ++I) { + if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) { + if (getBasePtrIndex(M) == I.getOperandNo()) + return true; + } + } + return false; +} + +bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (!N0.hasOneUse()) + return false; + // Take care of the opportunity to keep N0 uniform + if (N0->isDivergent() || !N1->isDivergent()) + return true; + // Check if we have a good chance to form the memory access pattern with the + // base and offset + return (DAG.isBaseWithConstantOffset(N0) && + hasMemSDNodeUser(*N0->use_begin())); +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1315cc15dd02..bf81e082b478 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -449,6 +449,11 @@ public: bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + bool hasMemSDNodeUser(SDNode *N) const; + + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6fbe5d45ce0a..f8a10bc8ef6f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -863,7 +863,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, Wait.ExpCnt = ~0u; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr + << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr << '\n'); } else { WaitcntInstr->eraseFromParent(); @@ -886,7 +886,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, Wait.VsCnt = ~0u; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI + << "Old Instr: " << *MI << "New Instr: " << *WaitcntVsCntInstr << '\n'); } else { WaitcntVsCntInstr->eraseFromParent(); @@ -1382,7 +1382,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { for (auto T : inst_counter_types()) { // Merge event flags for this counter - const bool OldOutOfOrder = counterOutOfOrder(T); const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -1425,7 +1424,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } - if (RegStrictDom && !OldOutOfOrder) + if (RegStrictDom) StrictDom = true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 1755b93538ce..0a2f9381e71f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -130,10 +130,24 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return false; } +static bool readsExecAsData(const MachineInstr &MI) { + if (MI.isCompare()) + return true; + + switch (MI.getOpcode()) { + default: + break; + case AMDGPU::V_READFIRSTLANE_B32: + return true; + } + + return false; +} + bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { // Any implicit use of exec by VALU is not a real register read. return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && - isVALU(*MO.getParent()); + isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent()); } bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, @@ -3184,10 +3198,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; + int NewMFMAOpc = -1; switch (Opc) { default: - return nullptr; + NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc == -1) + return nullptr; + break; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: IsF16 = true; @@ -3216,6 +3234,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } + MachineInstrBuilder MIB; + MachineBasicBlock &MBB = *MI.getParent(); + + if (NewMFMAOpc != -1) { + MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + return MIB; + } + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src0Mods = @@ -3226,8 +3257,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - MachineInstrBuilder MIB; - MachineBasicBlock &MBB = *MI.getParent(); if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && // If we have an SGPR input, we will violate the constant bus restriction. 
@@ -4520,6 +4549,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { + const MachineOperand &SrcOp = MI.getOperand(1); + if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { + ErrInfo = "pseudo expects only physical SGPRs"; + return false; + } + } + return true; } @@ -6122,11 +6159,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; case AMDGPU::S_CSELECT_B32: - lowerSelect32(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; case AMDGPU::S_CSELECT_B64: - splitSelect64(Worklist, Inst, MDT); + lowerSelect(Worklist, Inst, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_CMP_EQ_I32: @@ -6304,8 +6338,8 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, return std::make_pair(false, nullptr); } -void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT) const { +void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6380,95 +6414,6 @@ void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT) const { - // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them - // further. - const DebugLoc &DL = Inst.getDebugLoc(); - MachineBasicBlock::iterator MII = Inst; - MachineBasicBlock &MBB = *Inst.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // Get the original operands. - MachineOperand &Dest = Inst.getOperand(0); - MachineOperand &Src0 = Inst.getOperand(1); - MachineOperand &Src1 = Inst.getOperand(2); - MachineOperand &Cond = Inst.getOperand(3); - - Register SCCSource = Cond.getReg(); - bool IsSCC = (SCCSource == AMDGPU::SCC); - - // If this is a trivial select where the condition is effectively not SCC - // (SCCSource is a source of copy to SCC), then the select is semantically - // equivalent to copying SCCSource. Hence, there is no need to create - // V_CNDMASK, we can just use that and bail out. - if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) && - (Src1.isImm() && Src1.getImm() == 0)) { - MRI.replaceRegWith(Dest.getReg(), SCCSource); - return; - } - - // Prepare the split destination. - Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // Split the source operands. - const TargetRegisterClass *Src0RC = nullptr; - const TargetRegisterClass *Src0SubRC = nullptr; - if (Src0.isReg()) { - Src0RC = MRI.getRegClass(Src0.getReg()); - Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - } - const TargetRegisterClass *Src1RC = nullptr; - const TargetRegisterClass *Src1SubRC = nullptr; - if (Src1.isReg()) { - Src1RC = MRI.getRegClass(Src1.getReg()); - Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); - } - // Split lo. - MachineOperand SrcReg0Sub0 = - buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); - MachineOperand SrcReg1Sub0 = - buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); - // Split hi. 
- MachineOperand SrcReg0Sub1 = - buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = - buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); - // Select the lo part. - MachineInstr *LoHalf = - BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0) - .add(SrcReg0Sub0) - .add(SrcReg1Sub0); - // Replace the condition operand with the original one. - LoHalf->getOperand(3).setReg(SCCSource); - Worklist.insert(LoHalf); - // Select the hi part. - MachineInstr *HiHalf = - BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1) - .add(SrcReg0Sub1) - .add(SrcReg1Sub1); - // Replace the condition operand with the original one. - HiHalf->getOperand(3).setReg(SCCSource); - Worklist.insert(HiHalf); - // Merge them back to the original 64-bit one. - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep - // it valid. - legalizeOperands(*LoHalf, MDT); - legalizeOperands(*HiHalf, MDT); - - // Move all users of this moved value. - addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); -} - void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7820,6 +7765,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { } } + if (isMAI(Opcode)) { + int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); + if (MFMAOp != -1) + Opcode = MFMAOp; + } + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dd9ea2b53ca2..e551d6c7223f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -78,11 +78,8 @@ private: moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; - - void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; + void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; @@ -1249,6 +1246,10 @@ namespace AMDGPU { LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode); + /// \returns earlyclobber version of a MAC MFMA if it exists. 
+ LLVM_READONLY + int getMFMAEarlyClobberOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index dda92d3d25ff..713a08907e99 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2588,6 +2588,14 @@ def getFlatScratchInstSVfromSS : InstrMapping { let ValueCols = [["SV"]]; } +def getMFMAEarlyClobberOp : InstrMapping { + let FilterClass = "MFMATable"; + let RowFields = ["FMAOp"]; + let ColFields = ["IsMac"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 636337ede000..7be63ae6964b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1011,7 +1011,7 @@ def : GCNPat < } def : GCNPat < - (i32 (ctpop i32:$popcnt)), + (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) >; @@ -1020,6 +1020,14 @@ def : GCNPat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : GCNPat < + (i64 (DivergentUnaryFrag<ctpop> i64:$src)), + (REG_SEQUENCE VReg_64, + (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)), + (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0, + (i32 (V_MOV_B32_e32 (i32 0))), sub1) +>; + /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ @@ -1184,6 +1192,26 @@ def : Pat < (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) >; +def : Pat < + (extract_subvector v8i16:$vec, (i32 0)), + (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1)) +>; + +def : Pat < + (extract_subvector v8i16:$vec, (i32 4)), + (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3)) +>; + +def : Pat < + (extract_subvector v8f16:$vec, (i32 0)), + (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1)) +>; + +def : Pat < + (extract_subvector v8f16:$vec, (i32 4)), + (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) +>; + foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) @@ -1279,6 +1307,26 @@ def : BitConvert <v2i64, v2f64, VReg_128>; def : BitConvert <v2f64, v2i64, VReg_128>; def : BitConvert <v4f32, v2i64, VReg_128>; def : BitConvert <v2i64, v4f32, VReg_128>; +def : BitConvert <v8i16, v4i32, SReg_128>; +def : BitConvert <v4i32, v8i16, SReg_128>; +def : BitConvert <v8f16, v4f32, VReg_128>; +def : BitConvert <v8f16, v4i32, VReg_128>; +def : BitConvert <v4f32, v8f16, VReg_128>; +def : BitConvert <v4i32, v8f16, VReg_128>; +def : BitConvert <v8i16, v8f16, VReg_128>; +def : BitConvert <v8f16, v8i16, VReg_128>; +def : BitConvert <v4f32, v8i16, VReg_128>; +def : BitConvert <v8i16, v4f32, VReg_128>; +def : BitConvert <v8i16, v8f16, SReg_128>; +def : BitConvert <v8i16, v2i64, SReg_128>; +def : BitConvert <v8i16, v2f64, SReg_128>; +def : BitConvert <v8f16, v2i64, SReg_128>; +def : BitConvert <v8f16, v2f64, SReg_128>; +def : BitConvert <v8f16, v8i16, SReg_128>; +def : BitConvert <v2i64, v8i16, SReg_128>; +def : BitConvert <v2f64, v8i16, SReg_128>; +def : BitConvert <v2i64, v8f16, SReg_128>; +def : BitConvert <v2f64, v8f16, SReg_128>; // 160-bit bitcast def : BitConvert <v5i32, v5f32, SReg_160>; @@ 
-1762,44 +1810,44 @@ def BFIImm32 : PatFrag< // (y & x) | (z & ~x) def : AMDGPUPat < (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (V_BFI_B32_e64 $x, $y, $z) + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // (y & C) | (z & ~C) def : AMDGPUPat < (BFIImm32 i32:$x, i32:$y, i32:$z), - (V_BFI_B32_e64 $x, $y, $z) + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // 64-bit version def : AMDGPUPat < (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), - (REG_SEQUENCE SReg_64, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1) + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; // SHA-256 Ch function // z ^ (x & (y ^ z)) def : AMDGPUPat < (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (V_BFI_B32_e64 $x, $y, $z) + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // 64-bit version def : AMDGPUPat < (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), - (REG_SEQUENCE SReg_64, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1) + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; def : AMDGPUPat < @@ -2725,21 +2773,21 @@ def : AMDGPUPat < def : AMDGPUPat < (DivergentBinFrag<or> (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y) + (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) >; def : AMDGPUPat < (DivergentBinFrag<or> (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))), - (REG_SEQUENCE SReg_64, - (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0, - (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1) + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, + (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) >; multiclass IntMed3Pat<Instruction med3Inst, @@ -2825,6 +2873,15 @@ class AMDGPUGenericInstruction 
: GenericInstruction { let Namespace = "AMDGPU"; } +// Convert a wave address to a swizzled vector address (i.e. this is +// for copying the stack pointer to a vector address appropriate to +// use in the offset field of mubuf instructions). +def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + // Returns -1 if the input is zero. def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); @@ -3027,6 +3084,16 @@ def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { let mayStore = 1; } +def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins unknown:$intrin, variable_ops); + let hasSideEffects = 0; + let mayLoad = 1; + + // FIXME: Use separate opcode for atomics. + let mayStore = 1; +} + // This is equivalent to the G_INTRINSIC*, but the operands may have // been legalized depending on the subtarget requirements. def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { @@ -3036,6 +3103,13 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { let mayStore = 1; } +def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins unknown:$intrin, variable_ops); + let hasSideEffects = 0; + let mayStore = 1; +} + def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins unknown:$intrin, variable_ops); diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f4d9002e930e..c18637bdbc43 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -105,6 +105,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned DMask; InstClassEnum InstClass; unsigned CPol = 0; + bool IsAGPR; bool UseST64; int AddrIdx[MaxAddressRegs]; const MachineOperand *AddrReg[MaxAddressRegs]; @@ -158,8 +159,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { return true; } - void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, - const GCNSubtarget &STM); + void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); }; struct BaseRegisters { @@ -484,15 +484,16 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { } void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, - const SIInstrInfo &TII, - const GCNSubtarget &STM) { + const SILoadStoreOptimizer &LSO) { I = MI; unsigned Opc = MI->getOpcode(); - InstClass = getInstClass(Opc, TII); + InstClass = getInstClass(Opc, *LSO.TII); if (InstClass == UNKNOWN) return; + IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); + switch (InstClass) { case DS_READ: EltSize = @@ -505,7 +506,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, : 4; break; case S_BUFFER_LOAD_IMM: - EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); + EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); break; default: EltSize = 4; @@ -513,7 +514,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, } if (InstClass == MIMG) { - DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); + DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); // Offset is not considered for MIMG instructions. 
Offset = 0; } else { @@ -522,17 +523,17 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, } if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) - Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); + Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); - Width = getOpcodeWidth(*I, TII); + Width = getOpcodeWidth(*I, *LSO.TII); if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { Offset &= 0xffff; } else if (InstClass != MIMG) { - CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); + CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); } - AddressRegs Regs = getRegs(Opc, TII); + AddressRegs Regs = getRegs(Opc, *LSO.TII); NumAddresses = 0; for (unsigned J = 0; J < Regs.NumVAddrs; J++) @@ -910,19 +911,10 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( } const unsigned InstSubclass = getInstSubclass(Opc, *TII); - // Do not merge VMEM buffer instructions with "swizzled" bit set. - int Swizzled = - AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); - if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) - return false; - DenseSet<Register> RegDefsToMove; DenseSet<Register> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); - bool IsAGPR = TRI->hasAGPRs(DataRC); - MachineBasicBlock::iterator E = std::next(Paired.I); MachineBasicBlock::iterator MBBI = std::next(CI.I); MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); @@ -971,15 +963,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( continue; } - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return false; - - int Swizzled = - AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz); - if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm()) - return false; - // Handle a case like // DS_WRITE_B32 addr, v, idx0 // w = DS_READ_B32 addr, idx0 @@ -991,17 +974,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( continue; if (&*MBBI == &*Paired.I) { - if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR) - return false; - // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data - // operands. However we are reporting that ds_write2 shall have - // only VGPR data so that machine copy propagation does not - // create an illegal instruction with a VGPR and AGPR sources. - // Consequenctially if we create such instruction the verifier - // will complain. - if (IsAGPR && CI.InstClass == DS_WRITE) - return false; - // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. 
@@ -1542,49 +1514,36 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { - - assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero"); - bool ReverseOrder; if (CI.InstClass == MIMG) { assert( (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && "No overlaps"); ReverseOrder = CI.DMask > Paired.DMask; - } else + } else { ReverseOrder = CI.Offset > Paired.Offset; + } unsigned Idx0; unsigned Idx1; - if (CI.Width + Paired.Width > 4) { - assert(CI.Width == 4 && Paired.Width == 4); + static const unsigned Idxs[5][4] = { + {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, + {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, + {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, + {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, + {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, + }; - if (ReverseOrder) { - Idx1 = AMDGPU::sub0_sub1_sub2_sub3; - Idx0 = AMDGPU::sub4_sub5_sub6_sub7; - } else { - Idx0 = AMDGPU::sub0_sub1_sub2_sub3; - Idx1 = AMDGPU::sub4_sub5_sub6_sub7; - } + assert(CI.Width >= 1 && CI.Width <= 4); + assert(Paired.Width >= 1 && Paired.Width <= 4); + + if (ReverseOrder) { + Idx1 = Idxs[0][Paired.Width - 1]; + Idx0 = Idxs[Paired.Width][CI.Width - 1]; } else { - static const unsigned Idxs[4][4] = { - {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, - {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, - {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, - {AMDGPU::sub3, 0, 0, 0}, - }; - - assert(CI.Width >= 1 && CI.Width <= 3); - assert(Paired.Width >= 1 && Paired.Width <= 3); - - if (ReverseOrder) { - Idx1 = Idxs[0][Paired.Width - 1]; - Idx0 = Idxs[Paired.Width][CI.Width - 1]; - } else { - Idx0 = Idxs[0][CI.Width - 1]; - Idx1 = Idxs[CI.Width][Paired.Width - 1]; - } + Idx0 = Idxs[0][CI.Width - 1]; + Idx1 = Idxs[CI.Width][Paired.Width - 1]; } return std::make_pair(Idx0, Idx1); @@ -1847,7 +1806,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0) return false; - if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + if (MI.mayLoad() && + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) return false; if (AnchorList.count(&MI)) @@ -1988,6 +1948,7 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, std::list<std::list<CombineInfo> > &MergeableInsts) const { for (std::list<CombineInfo> &AddrList : MergeableInsts) { if (AddrList.front().InstClass == CI.InstClass && + AddrList.front().IsAGPR == CI.IsAGPR && AddrList.front().hasSameBaseAddress(*CI.I)) { AddrList.emplace_back(CI); return; @@ -2030,13 +1991,29 @@ SILoadStoreOptimizer::collectMergeableInsts( if (InstClass == UNKNOWN) continue; + // Do not merge VMEM buffer instructions with "swizzled" bit set. + int Swizzled = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && MI.getOperand(Swizzled).getImm()) + continue; + CombineInfo CI; - CI.setMI(MI, *TII, *STM); + CI.setMI(MI, *this); CI.Order = Order++; if (!CI.hasMergeableAddress(*MRI)) continue; + if (CI.InstClass == DS_WRITE && CI.IsAGPR) { + // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data + // operands. 
However we are reporting that ds_write2 shall have + // only VGPR data so that machine copy propagation does not + // create an illegal instruction with a VGPR and AGPR sources. + // Consequently, if we create such an instruction, the verifier + // will complain. + continue; + } + LLVM_DEBUG(dbgs() << "Mergeable: " << MI); addInstToMergeableList(CI, MergeableInsts); @@ -2144,54 +2121,54 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( case DS_READ: { MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); break; } case DS_WRITE: { MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); break; } case S_BUFFER_LOAD_IMM: { MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 8; break; } case BUFFER_LOAD: { MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case BUFFER_STORE: { MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case MIMG: { MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case TBUFFER_LOAD: { MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case TBUFFER_STORE: { MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 3168bcd53eda..e1018bdfde46 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -56,6 +56,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -90,6 +91,8 @@ private: unsigned OrSaveExecOpc; unsigned Exec; + bool EnableOptimizeEndCf = false; + bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End); void emitIf(MachineInstr &MI); @@ -579,10 +582,10 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { void SILowerControlFlow::optimizeEndCf() { // If the only instruction immediately following this END_CF is another // END_CF in the only successor, we can avoid emitting the exec mask restore here. 
- if (!RemoveRedundantEndcf) + if (!EnableOptimizeEndCf) return; - for (MachineInstr *MI : LoweredEndCf) { + for (MachineInstr *MI : reverse(LoweredEndCf)) { MachineBasicBlock &MBB = *MI->getParent(); auto Next = skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator())); @@ -807,6 +810,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); + EnableOptimizeEndCf = + RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None; // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable<LiveIntervals>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 55196fe334e6..0fbdbef6fcce 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -127,7 +127,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, // FIXME: Just emit the readlane/writelane directly if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CI : reverse(CSI)) { - unsigned Reg = CI.getReg(); + Register Reg = CI.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MVT::i32); @@ -239,50 +239,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { return false; } -// Find lowest available VGPR and use it as VGPR reserved for SGPR spills. -static bool lowerShiftReservedVGPR(MachineFunction &MF, - const GCNSubtarget &ST) { - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; - // Early out if pre-reservation of a VGPR for SGPR spilling is disabled. - if (!PreReservedVGPR) - return false; - - // If there are no free lower VGPRs available, default to using the - // pre-reserved register instead. - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - Register LowestAvailableVGPR = - TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF); - if (!LowestAvailableVGPR) - LowestAvailableVGPR = PreReservedVGPR; - - MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - // Create a stack object for a possible spill in the function prologue. - // Note Non-CSR VGPR also need this as we may overwrite inactive lanes. - Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4)); - - // Find saved info about the pre-reserved register. 
- const auto *ReservedVGPRInfoItr = - llvm::find_if(FuncInfo->getSGPRSpillVGPRs(), - [PreReservedVGPR](const auto &SpillRegInfo) { - return SpillRegInfo.VGPR == PreReservedVGPR; - }); - - assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end()); - auto Index = - std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr); - - FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index); - - for (MachineBasicBlock &MBB : MF) { - assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR"); - MBB.addLiveIn(LowestAvailableVGPR); - MBB.sortUniqueLiveIns(); - } - - return true; -} - bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -304,11 +260,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { if (!MFI.hasStackObjects() && !HasCSRs) { SaveBlocks.clear(); RestoreBlocks.clear(); - if (FuncInfo->VGPRReservedForSGPRSpill) { - // Free the reserved VGPR for later possible use by frame lowering. - FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF); - MRI.freezeReservedRegs(MF); - } return false; } @@ -326,8 +277,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // This operates under the assumption that only other SGPR spills are users // of the frame index. - lowerShiftReservedVGPR(MF, ST); - // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); @@ -375,8 +324,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { FuncInfo->removeDeadFrameIndices(MFI); MadeChange = true; - } else if (FuncInfo->VGPRReservedForSGPRSpill) { - FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF); } SaveBlocks.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3ce368ef4db9..cca8565c9ff9 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -118,10 +118,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x")) WorkItemIDX = true; - if (!F.hasFnAttribute("amdgpu-no-workitem-id-y")) + if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") && + ST.getMaxWorkitemID(F, 1) != 0) WorkItemIDY = true; - if (!F.hasFnAttribute("amdgpu-no-workitem-id-z")) + if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") && + ST.getMaxWorkitemID(F, 2) != 0) WorkItemIDZ = true; if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) @@ -274,7 +276,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned WaveSize = ST.getWavefrontSize(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); unsigned Size = FrameInfo.getObjectSize(FI); unsigned NumLanes = Size / 4; @@ -291,16 +292,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, Register LaneVGPR; unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); - // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and - // when one of the two conditions is true: - // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet - // reserved. - // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is - // required. 
- if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) { - assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR); - LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill; - } else if (VGPRIndex == 0) { + if (VGPRIndex == 0) { LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not @@ -308,6 +300,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; + // FIXME: We can run out of free registers with split allocation if + // IPRA is enabled and a called function already uses every VGPR. #if 0 DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), "VGPRs for SGPR spilling", @@ -340,21 +334,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, return true; } -/// Reserve a VGPR for spilling of SGPRs -bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - - Register LaneVGPR = TRI->findUnusedRegister( - MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true); - if (LaneVGPR == Register()) - return false; - SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None)); - FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR; - return true; -} - /// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI. /// Either AGPR is spilled to VGPR or vice versa. /// Returns true if a \p FI can be eliminated completely. @@ -616,24 +595,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( return false; } -// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs -bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR, - MachineFunction &MF) { - for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) { - if (i->VGPR == ReservedVGPR) { - SpillVGPRs.erase(i); - - for (MachineBasicBlock &MBB : MF) { - MBB.removeLiveIn(ReservedVGPR); - MBB.sortUniqueLiveIns(); - } - this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister; - return true; - } - } - return false; -} - bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const { if (UsesAGPRs) return *UsesAGPRs; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 8accbf611c5f..8e821274bb77 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -502,7 +502,6 @@ public: // FIXME Register SGPRForBPSaveRestoreCopy; Optional<int> BasePointerSaveIndex; - Register VGPRReservedForSGPRSpill; bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); public: @@ -528,7 +527,6 @@ public: void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) { SpillVGPRs[Index].VGPR = NewVGPR; SpillVGPRs[Index].FI = newFI; - VGPRReservedForSGPRSpill = NewVGPR; } bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF); @@ -556,7 +554,6 @@ public: bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); - bool reserveVGPRforSGPRSpills(MachineFunction &MF); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); void removeDeadFrameIndices(MachineFrameInfo &MFI); diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 69eab762f05c..24a8879b5684 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -188,7 +188,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask); unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset); unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); - BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32)) .addImm(Value) .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 6bf6c45d8cf6..e13e33ed5457 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -155,6 +155,11 @@ public: return MachineFunctionProperties().set( MachineFunctionProperties::Property::IsSSA); } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } }; } // end anonymous namespace @@ -366,47 +371,42 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( // Re-calculate the liveness of \p Reg in the THEN-region void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { - - SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming; - - MachineBasicBlock *ThenEntry = nullptr; - for (auto *Succ : If->successors()) { - if (Succ != Flow) { - ThenEntry = Succ; - break; + SetVector<MachineBasicBlock *> Blocks; + SmallVector<MachineBasicBlock *> WorkList({If}); + + // Collect all successors until we see the flow block, where we should + // reconverge. + while (!WorkList.empty()) { + auto *MBB = WorkList.pop_back_val(); + for (auto *Succ : MBB->successors()) { + if (Succ != Flow && !Blocks.contains(Succ)) { + WorkList.push_back(Succ); + Blocks.insert(Succ); + } } } - assert(ThenEntry && "No successor in Then region?"); LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); - df_iterator_default_set<MachineBasicBlock *, 16> Visited; - - for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { - if (MBB == Flow) - break; - + for (MachineBasicBlock *MBB : Blocks) { // Clear Live bit, as we will recalculate afterwards LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB) << '\n'); OldVarInfo.AliveBlocks.reset(MBB->getNumber()); } + SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming; + // Get the blocks the Reg should be alive through for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; ++I) { auto *UseMI = I->getParent(); if (UseMI->isPHI() && I->readsReg()) { - if (Visited.contains(UseMI->getParent())) + if (Blocks.contains(UseMI->getParent())) PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB()); } } - Visited.clear(); - - for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { - if (MBB == Flow) - break; - + for (MachineBasicBlock *MBB : Blocks) { SmallVector<MachineInstr *> Uses; // PHI instructions have been processed before. findNonPHIUsesInBlock(Reg, MBB, Uses); @@ -433,7 +433,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( // Set the isKilled flag if we get new Kills in the THEN region.
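The worklist traversal introduced in this hunk is a reusable pattern: gather every block reachable from a header without crossing a designated reconvergence block, deduplicating as you go. A self-contained sketch under assumed names; llvm::SetVector gives O(1) membership tests plus deterministic iteration order, which is why the pass can drop the separate df_iterator visited set:

#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

// Collect all blocks reachable from Entry without passing through Stop.
// Entry itself is intentionally not added, mirroring the pass above.
static llvm::SetVector<llvm::MachineBasicBlock *>
collectRegionBlocks(llvm::MachineBasicBlock *Entry,
                    llvm::MachineBasicBlock *Stop) {
  llvm::SetVector<llvm::MachineBasicBlock *> Blocks;
  llvm::SmallVector<llvm::MachineBasicBlock *, 8> WorkList{Entry};
  while (!WorkList.empty()) {
    llvm::MachineBasicBlock *MBB = WorkList.pop_back_val();
    for (llvm::MachineBasicBlock *Succ : MBB->successors())
      if (Succ != Stop && Blocks.insert(Succ)) // insert() is true when new
        WorkList.push_back(Succ);
  }
  return Blocks;
}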
for (auto *MI : OldVarInfo.Kills) { - if (Visited.contains(MI->getParent())) + if (Blocks.contains(MI->getParent())) MI->addRegisterKilled(Reg, TRI); } } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 340e2b48e5cd..eb9452f4b85e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -617,7 +617,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16 let HasSGPR = 1; } -def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -784,7 +784,7 @@ multiclass SRegClass<int numRegs, int priority, } defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; @@ -824,7 +824,7 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], (add VGPR_64)>; defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>; +defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; @@ -846,7 +846,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], (add AGPR_64)>; defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; -defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; +defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>; defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 77ee3c0ff0e4..46efb3c605c6 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -861,12 +861,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, MachineInstr *VcmpMI; const MachineOperand &Op0 = MI.getOperand(0); const MachineOperand &Op1 = MI.getOperand(1); + + // VCC represents lanes killed. + Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + if (TRI->isVGPR(*MRI, Op0.getReg())) { Opcode = AMDGPU::getVOPe32(Opcode); VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); } else { VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(AMDGPU::VCC, RegState::Define) + .addReg(VCC, RegState::Define) .addImm(0) // src0 modifiers .add(Op1) .addImm(0) // src1 modifiers @@ -874,9 +878,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, .addImm(0); // omod } - // VCC represents lanes killed. - Register VCC = ST->isWave32() ? 
AMDGPU::VCC_LO : AMDGPU::VCC; - MachineInstr *MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 1713586dcf5b..3f7837f7dbf1 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -246,10 +246,10 @@ let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", - [(set i32:$sdst, (ctpop i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >; def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64", - [(set i32:$sdst, (ctpop i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<ctpop> i64:$src0))] >; } // End Defs = [SCC] @@ -518,10 +518,9 @@ let Uses = [SCC] in { def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32", [(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))] >; - def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64", - [(set i64:$sdst, (SelectPat<select> i64:$src0, i64:$src1))] - >; } + + def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">; } // End Uses = [SCC] let Defs = [SCC] in { @@ -551,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64", >; def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", - [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))] + [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))] >; def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", - [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] + [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))] >; def S_NAND_B32 : SOP2_32 <"s_nand_b32", @@ -1371,7 +1370,7 @@ def : GCNPat < >; def : GCNPat < - (i64 (ctpop i64:$src)), + (i64 (UniformUnaryFrag<ctpop> i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 (i32 0)), sub1)) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 0bee9022975e..18c348d1cf89 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -79,8 +79,8 @@ const char* const IdSymbolic[] = { "HW_REG_FLAT_SCR_LO", "HW_REG_FLAT_SCR_HI", "HW_REG_XNACK_MASK", - nullptr, // HW_ID1, no predictable values - nullptr, // HW_ID2, no predictable values + "HW_REG_HW_ID1", + "HW_REG_HW_ID2", "HW_REG_POPS_PACKER", nullptr, nullptr, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index d20eaaaa65e8..1e96266eb06c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -132,6 +132,8 @@ bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) { #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL #define GET_MIMGMIPMappingTable_IMPL +#define GET_MIMGBiasMappingTable_IMPL +#define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" @@ -410,7 +412,7 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { } std::string AMDGPUTargetID::toString() const { - std::string StringRep = ""; + std::string StringRep; raw_string_ostream StreamRep(StringRep); auto TargetTriple = STI.getTargetTriple(); @@ -421,7 +423,7 @@ std::string AMDGPUTargetID::toString() const { << TargetTriple.getOSName() << '-' << TargetTriple.getEnvironmentName() << '-'; - std::string Processor = ""; + std::string Processor; // TODO: 
The following else statement is present here because we used various // alias names for GPUs up until GFX9 (e.g. 'fiji' is the same as 'gfx803'). // Remove once all aliases are removed from GCNProcessors.td. @@ -432,7 +434,7 @@ std::string AMDGPUTargetID::toString() const { Twine(Version.Stepping)) .str(); - std::string Features = ""; + std::string Features; if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) { switch (*HsaAbiVersion) { case ELF::ELFABIVERSION_AMDGPU_HSA_V2: @@ -1018,9 +1020,18 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) { } bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) { - return - ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && - IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI)); + switch (Id) { + case ID_HW_ID: + return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI); + case ID_HW_ID1: + case ID_HW_ID2: + return isGFX10Plus(STI); + case ID_XNACK_MASK: + return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI); + default: + return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && + IdSymbolic[Id]; + } } bool isValidHwreg(int64_t Id) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 061c74c0ace6..89f928eb8b92 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -64,6 +64,7 @@ struct GcnBufferFormatInfo { #define GET_MIMGEncoding_DECL #define GET_MIMGLZMapping_DECL #define GET_MIMGMIPMapping_DECL +#define GET_MIMGBiasMapping_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -330,6 +331,16 @@ struct MIMGMIPMappingInfo { MIMGBaseOpcode NONMIP; }; +struct MIMGBiasMappingInfo { + MIMGBaseOpcode Bias; + MIMGBaseOpcode NoBias; +}; + +struct MIMGOffsetMappingInfo { + MIMGBaseOpcode Offset; + MIMGBaseOpcode NoOffset; +}; + struct MIMGG16MappingInfo { MIMGBaseOpcode G; MIMGBaseOpcode G16; @@ -342,6 +353,12 @@ LLVM_READONLY const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); LLVM_READONLY +const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias); + +LLVM_READONLY +const MIMGOffsetMappingInfo *getMIMGOffsetMappingInfo(unsigned Offset); + +LLVM_READONLY const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 8d232ffe4114..b9ff814a4dc5 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> : ) >; -def : divergent_i64_BinOp <and, V_AND_B32_e32>; -def : divergent_i64_BinOp <or, V_OR_B32_e32>; -def : divergent_i64_BinOp <xor, V_XOR_B32_e32>; +def : divergent_i64_BinOp <and, V_AND_B32_e64>; +def : divergent_i64_BinOp <or, V_OR_B32_e64>; +def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; let SubtargetPredicate = Has16BitInsts in { @@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in { let isReMaterializable = 1 in defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>; +def : GCNPat< + (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG
$src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + +def : GCNPat< + (i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + let Constraints = "$vdst = $src2", DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 32222b3eb93c..707475ceccee 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -388,6 +388,12 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC let HasModifiers = 0; let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); + // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs. + // We then create two versions of the instruction: with tied dst and src2 + // and with the earlyclobber flag on the dst. This is stricter than the + // actual HW restriction. In particular earlyclobber also affects src0 and + // src1 allocation which is not required. + bit NoDstOverlap = !gt(DstVT.Size, 128); } def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>; @@ -426,6 +432,11 @@ def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>; def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>; +class MFMATable <bit is_mac, string Name> { + bit IsMac = is_mac; + string FMAOp = Name; +} + let Predicates = [HasMAIInsts] in { let isAsCheapAsAMove = 1, isReMaterializable = 1 in { @@ -435,13 +446,31 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in { } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 -multiclass MAIInst<string OpName, string P, SDPatternOperator node> { +multiclass MAIInst<string OpName, string P, SDPatternOperator node, + bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> { let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
- defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>; - - let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in - defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>; + let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { + defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>, + MFMATable<0, NAME # "_e64">; + + let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in + defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>, + MFMATable<0, NAME # "_vgprcd_e64">; + } + + foreach _ = BoolToList<NoDstOverlap>.ret in { + let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), + isConvertibleToThreeAddress = NoDstOverlap, + Mnemonic = OpName in { + defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>, + MFMATable<1, NAME # "_e64">; + + let SubtargetPredicate = isGFX90APlus in + defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>, + MFMATable<1, NAME # "_vgprcd_e64">; + } + } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } @@ -517,6 +546,7 @@ multiclass VOP3P_Real_MAI<bits<7> op> { } } +let Constraints = "" in { multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> { let SubtargetPredicate = isGFX90AOnly, AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in { @@ -536,6 +566,7 @@ multiclass VOP3P_Real_MFMA<bits<7> op> : let DecoderNamespace = "GFX8"; } } +} defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>; diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h index 1d5e45aec06c..979371bf7cf6 100644 --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -25,12 +25,9 @@ class ARMAsmPrinter; class ARMBaseTargetMachine; class ARMRegisterBankInfo; class ARMSubtarget; -struct BasicBlockInfo; class Function; class FunctionPass; class InstructionSelector; -class MachineBasicBlock; -class MachineFunction; class MachineInstr; class MCInst; class PassRegistry; diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 8173fe4036a8..4efbdbb2abc8 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -512,8 +512,7 @@ def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", - [HasV6T2Ops, FeaturePerfMon, - FeatureV7Clrex]>; + [HasV6T2Ops, FeatureV7Clrex]>; def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", @@ -522,7 +521,7 @@ def HasV8MMainlineOps : def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", - [HasV7Ops, FeatureAcquireRelease]>; + [HasV7Ops, FeaturePerfMon, FeatureAcquireRelease]>; def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", @@ -553,6 +552,10 @@ def HasV8_7aOps : SubtargetFeature<"v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions", [HasV8_6aOps]>; +def HasV8_8aOps : SubtargetFeature<"v8.8a", "HasV8_8aOps", "true", + "Support ARM v8.8a instructions", + [HasV8_7aOps]>; + def HasV9_0aOps : SubtargetFeature<"v9a", "HasV9_0aOps", "true", "Support ARM v9a instructions", [HasV8_5aOps]>; @@ -565,6 +568,10 @@ def HasV9_2aOps : SubtargetFeature<"v9.2a", "HasV9_2aOps", "true", "Support ARM v9.2a instructions", [HasV8_7aOps, HasV9_1aOps]>; +def HasV9_3aOps : 
SubtargetFeature<"v9.3a", "HasV9_3aOps", "true", + "Support ARM v9.3a instructions", + [HasV8_8aOps, HasV9_2aOps]>; + def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", "Support ARM v8-1M Mainline instructions", @@ -757,7 +764,8 @@ def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops, FeatureNEON, FeatureDB, FeatureDSP, - FeatureAClass]>; + FeatureAClass, + FeaturePerfMon]>; def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops, FeatureNEON, @@ -766,13 +774,15 @@ def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops, FeatureTrustZone, FeatureMP, FeatureVirtualization, - FeatureAClass]>; + FeatureAClass, + FeaturePerfMon]>; def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, FeatureDB, FeatureDSP, FeatureHWDivThumb, - FeatureRClass]>; + FeatureRClass, + FeaturePerfMon]>; def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, FeatureThumb2, @@ -894,6 +904,19 @@ def ARMv87a : Architecture<"armv8.7-a", "ARMv87a", [HasV8_7aOps, FeatureCRC, FeatureRAS, FeatureDotProd]>; +def ARMv88a : Architecture<"armv8.8-a", "ARMv88a", [HasV8_8aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; def ARMv9a : Architecture<"armv9-a", "ARMv9a", [HasV9_0aOps, FeatureAClass, @@ -931,6 +954,19 @@ def ARMv92a : Architecture<"armv9.2-a", "ARMv92a", [HasV9_2aOps, FeatureCRC, FeatureRAS, FeatureDotProd]>; +def ARMv93a : Architecture<"armv9.3-a", "ARMv93a", [HasV9_3aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, @@ -1425,8 +1461,7 @@ def : ProcNoItin<"neoverse-n1", [ARMv82a, def : ProcNoItin<"neoverse-n2", [ARMv85a, FeatureBF16, - FeatureMatMulInt8, - FeaturePerfMon]>; + FeatureMatMulInt8]>; def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 884f38ff6c58..cde715880376 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4868,6 +4868,36 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } } + + // Check the address model by taking the first Imm operand and checking it is + // legal for that addressing mode. 
+ ARMII::AddrMode AddrMode = + (ARMII::AddrMode)(MI.getDesc().TSFlags & ARMII::AddrModeMask); + switch (AddrMode) { + default: + break; + case ARMII::AddrModeT2_i7: + case ARMII::AddrModeT2_i7s2: + case ARMII::AddrModeT2_i7s4: + case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8pos: + case ARMII::AddrModeT2_i8neg: + case ARMII::AddrModeT2_i8s4: + case ARMII::AddrModeT2_i12: { + uint32_t Imm = 0; + for (auto Op : MI.operands()) { + if (Op.isImm()) { + Imm = Op.getImm(); + break; + } + } + if (!isLegalAddressImm(MI.getOpcode(), Imm, this)) { + ErrInfo = "Incorrect AddrMode Imm for instruction"; + return false; + } + break; + } + } return true; } diff --git a/llvm/lib/Target/ARM/ARMCallLowering.cpp b/llvm/lib/Target/ARM/ARMCallLowering.cpp index 81ec4d09a408..b15ef094d9d2 100644 --- a/llvm/lib/Target/ARM/ARMCallLowering.cpp +++ b/llvm/lib/Target/ARM/ARMCallLowering.cpp @@ -534,7 +534,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo & MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP) .addImm(ArgAssigner.StackOffset) - .addImm(0) + .addImm(-1ULL) .add(predOps(ARMCC::AL)); return true; diff --git a/llvm/lib/Target/ARM/ARMCallLowering.h b/llvm/lib/Target/ARM/ARMCallLowering.h index 87b18f811747..38095617fb4f 100644 --- a/llvm/lib/Target/ARM/ARMCallLowering.h +++ b/llvm/lib/Target/ARM/ARMCallLowering.h @@ -23,7 +23,6 @@ namespace llvm { class ARMTargetLowering; -class MachineFunction; class MachineInstrBuilder; class MachineIRBuilder; class Value; diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index fa244786a80d..2f083561bbd4 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1144,7 +1144,7 @@ static bool determineFPRegsToClear(const MachineInstr &MI, if (!Op.isReg()) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); if (Op.isDef()) { if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) || (Reg >= ARM::D0 && Reg <= ARM::D15) || @@ -1356,7 +1356,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( std::vector<unsigned> NonclearedFPRegs; for (const MachineOperand &Op : MBBI->operands()) { if (Op.isReg() && Op.isUse()) { - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); assert(!ARM::DPRRegClass.contains(Reg) || ARM::DPR_VFP2RegClass.contains(Reg)); assert(!ARM::QPRRegClass.contains(Reg)); @@ -1451,9 +1451,9 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( // restore FPSCR from stack and clear bits 0-4, 7, 28-31 // The other bits are program global according to the AAPCS if (passesFPReg) { - BuildMI(MBB, MBBI, DL, TII->get(ARM::t2LDRi8), SpareReg) + BuildMI(MBB, MBBI, DL, TII->get(ARM::tLDRspi), SpareReg) .addReg(ARM::SP) - .addImm(0x40) + .addImm(0x10) .add(predOps(ARMCC::AL)); BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg) .addReg(SpareReg) @@ -1543,7 +1543,7 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8( std::vector<unsigned> NonclearedFPRegs; for (const MachineOperand &Op : MBBI->operands()) { if (Op.isReg() && Op.isDef()) { - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); assert(!ARM::DPRRegClass.contains(Reg) || ARM::DPR_VFP2RegClass.contains(Reg)); assert(!ARM::QPRRegClass.contains(Reg)); @@ -1663,7 +1663,7 @@ static bool definesOrUsesFPReg(const MachineInstr &MI) { for (const MachineOperand &Op : MI.operands()) { if (!Op.isReg()) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) || (Reg >= ARM::D0 && Reg <= ARM::D15) || (Reg >= ARM::S0 && Reg <= 
ARM::S31)) @@ -2201,7 +2201,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::tBLXNS_CALL: { DebugLoc DL = MBBI->getDebugLoc(); - unsigned JumpReg = MBBI->getOperand(0).getReg(); + Register JumpReg = MBBI->getOperand(0).getReg(); // Figure out which registers are live at the point immediately before the // call. When we indiscriminately push a set of registers, the live diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 28a076edd6dc..5d94b99d4c5d 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -319,7 +319,7 @@ unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode, unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, unsigned Op1) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); // Make sure the input operands are sufficiently constrained to be legal @@ -346,7 +346,7 @@ unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode, unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, uint64_t Imm) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); // Make sure the input operand is sufficiently constrained to be legal @@ -371,7 +371,7 @@ unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode, unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode, const TargetRegisterClass *RC, uint64_t Imm) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) { @@ -392,7 +392,7 @@ unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode, unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) { if (VT == MVT::f64) return 0; - unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + Register MoveReg = createResultReg(TLI.getRegClassFor(VT)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVSR), MoveReg) .addReg(SrcReg)); @@ -402,7 +402,7 @@ unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) { unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) { if (VT == MVT::i64) return 0; - unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + Register MoveReg = createResultReg(TLI.getRegClassFor(VT)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVRS), MoveReg) .addReg(SrcReg)); @@ -428,7 +428,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) { Imm = ARM_AM::getFP32Imm(Val); Opc = ARM::FCONSTS; } - unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Register DestReg = createResultReg(TLI.getRegClassFor(VT)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg).addImm(Imm)); return DestReg; @@ -440,7 +440,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) { // MachineConstantPool wants an explicit alignment. Align Alignment = DL.getPrefTypeAlign(CFP->getType()); unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment); - unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Register DestReg = createResultReg(TLI.getRegClassFor(VT)); unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS; // The extra reg is for addrmode5. 
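The recurring unsigned-to-Register migration in these ARM hunks is mechanical but worth spelling out once. llvm::Register converts implicitly to and from the raw register id, so existing call sites keep compiling, while the wrapper adds self-documenting queries. A small illustration (sketch only, not from this commit):

#include "llvm/CodeGen/Register.h"

void registerWrapperExample(llvm::Register R) {
  unsigned Raw = R;          // Register -> unsigned, implicit
  llvm::Register Back = Raw; // unsigned -> Register, implicit
  if (Back.isPhysical()) {
    // A concrete register such as ARM::R0.
  } else if (Back.isVirtual()) {
    // A virtual register created through MachineRegisterInfo.
  }
  (void)Raw;
}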
@@ -462,7 +462,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { unsigned Opc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16; const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; - unsigned ImmReg = createResultReg(RC); + Register ImmReg = createResultReg(RC); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) .addImm(CI->getZExtValue())); @@ -478,7 +478,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi; const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; - unsigned ImmReg = createResultReg(RC); + Register ImmReg = createResultReg(RC); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) .addImm(Imm)); @@ -531,7 +531,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { bool IsIndirect = Subtarget->isGVIndirectSymbol(GV); const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); // FastISel TLS support on non-MachO is broken, punt to SelectionDAG. const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); @@ -589,7 +589,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { if (IsPositionIndependent) { unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD; - unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + Register NewDestReg = createResultReg(TLI.getRegClassFor(VT)); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), NewDestReg) @@ -605,7 +605,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { (Subtarget->isTargetMachO() && IsIndirect) || Subtarget->genLongCalls()) { MachineInstrBuilder MIB; - unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + Register NewDestReg = createResultReg(TLI.getRegClassFor(VT)); if (isThumb2) MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::t2LDRi12), NewDestReg) @@ -657,7 +657,7 @@ unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) { if (SI != FuncInfo.StaticAllocaMap.end()) { unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; const TargetRegisterClass* RC = TLI.getRegClassFor(VT); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -832,7 +832,7 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) { if (needsLowering && Addr.BaseType == Address::FrameIndexBase) { const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass; - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) @@ -991,7 +991,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, // If we had an unaligned load of a float we've converted it to a regular // load. Now we must move from the GPR to the FP register.
if (needVMOV) { - unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32)); + Register MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVSR), MoveReg) .addReg(ResultReg)); @@ -1044,7 +1044,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, // This is mostly going to be Neon/vector support. default: return false; case MVT::i1: { - unsigned Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass + Register Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri; SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1); @@ -1095,7 +1095,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, if (!Subtarget->hasVFP2Base()) return false; // Unaligned stores need special handling. Floats require word-alignment. if (Alignment && Alignment < 4) { - unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + Register MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVRS), MoveReg) .addReg(SrcReg)); @@ -1257,7 +1257,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { if (TI->hasOneUse() && TI->getParent() == I->getParent() && (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri; - unsigned OpReg = getRegForValue(TI->getOperand(0)); + Register OpReg = getRegForValue(TI->getOperand(0)); OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc)) @@ -1284,7 +1284,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { return true; } - unsigned CmpReg = getRegForValue(BI->getCondition()); + Register CmpReg = getRegForValue(BI->getCondition()); if (CmpReg == 0) return false; // We've been divorced from our compare! Our block was split, and @@ -1315,7 +1315,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { } bool ARMFastISel::SelectIndirectBr(const Instruction *I) { - unsigned AddrReg = getRegForValue(I->getOperand(0)); + Register AddrReg = getRegForValue(I->getOperand(0)); if (AddrReg == 0) return false; unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX; @@ -1406,7 +1406,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, break; } - unsigned SrcReg1 = getRegForValue(Src1Value); + Register SrcReg1 = getRegForValue(Src1Value); if (SrcReg1 == 0) return false; unsigned SrcReg2 = 0; @@ -1468,7 +1468,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi; const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0); unsigned ZeroReg = fastMaterializeConstant(Zero); // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR. 
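For context, the SelectCmp lowering that the FMSTAT comment above belongs to materializes a boolean in three steps: move 0 into the result register, emit the compare (FP compares copy their flags to CPSR via FMSTAT), then conditionally overwrite the 0 with 1. A hedged sketch of that final conditional move, with assumed parameter names and simplified includes; it is not the exact FastISel code:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Sketch: the tail of a FastISel-style compare+select lowering. Assumes a
// compare defining CPSR was already emitted, and that MovCCOpc is
// ARM::MOVCCi or ARM::t2MOVCCi.
static void emitBoolFromCompare(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator InsertPt,
                                const DebugLoc &DL, const TargetInstrInfo &TII,
                                unsigned MovCCOpc, Register DestReg,
                                Register ZeroReg, unsigned Pred) {
  BuildMI(MBB, InsertPt, DL, TII.get(MovCCOpc), DestReg)
      .addReg(ZeroReg)    // value kept when the predicate is false
      .addImm(1)          // immediate written when the predicate holds
      .addImm(Pred)       // e.g. ARMCC::NE, derived from the IR compare
      .addReg(ARM::CPSR); // flags produced by the preceding compare
}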
@@ -1488,10 +1488,10 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) { if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) return false; - unsigned Op = getRegForValue(V); + Register Op = getRegForValue(V); if (Op == 0) return false; - unsigned Result = createResultReg(&ARM::DPRRegClass); + Register Result = createResultReg(&ARM::DPRRegClass); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VCVTDS), Result) .addReg(Op)); @@ -1507,10 +1507,10 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) { if (!(I->getType()->isFloatTy() && V->getType()->isDoubleTy())) return false; - unsigned Op = getRegForValue(V); + Register Op = getRegForValue(V); if (Op == 0) return false; - unsigned Result = createResultReg(&ARM::SPRRegClass); + Register Result = createResultReg(&ARM::SPRRegClass); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VCVTSD), Result) .addReg(Op)); @@ -1535,7 +1535,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (SrcReg == 0) return false; // Handle sign-extension. @@ -1556,7 +1556,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { Opc = isSigned ? ARM::VSITOD : ARM::VUITOD; else return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(DstVT)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg).addReg(FP)); updateValueMap(I, ResultReg); @@ -1572,7 +1572,7 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) { if (!isTypeLegal(RetTy, DstVT)) return false; - unsigned Op = getRegForValue(I->getOperand(0)); + Register Op = getRegForValue(I->getOperand(0)); if (Op == 0) return false; unsigned Opc; @@ -1583,7 +1583,7 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) { else return false; // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); + Register ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg).addReg(Op)); @@ -1604,9 +1604,9 @@ bool ARMFastISel::SelectSelect(const Instruction *I) { // Things need to be register sized for register moves. if (VT != MVT::i32) return false; - unsigned CondReg = getRegForValue(I->getOperand(0)); + Register CondReg = getRegForValue(I->getOperand(0)); if (CondReg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); + Register Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; // Check to see if we can use an immediate in the conditional move. @@ -1649,7 +1649,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) { else MovCCOpc = isThumb2 ? 
ARM::t2MVNCCi : ARM::MVNCCi; } - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); if (!UseImm) { Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1); Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2); @@ -1752,15 +1752,15 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { break; } - unsigned SrcReg1 = getRegForValue(I->getOperand(0)); + Register SrcReg1 = getRegForValue(I->getOperand(0)); if (SrcReg1 == 0) return false; // TODO: Often the 2nd operand is an immediate, which can be encoded directly // in the instruction, rather than materializing the value in a register. - unsigned SrcReg2 = getRegForValue(I->getOperand(1)); + Register SrcReg2 = getRegForValue(I->getOperand(1)); if (SrcReg2 == 0) return false; - unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass); + Register ResultReg = createResultReg(&ARM::GPRnopcRegClass); SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1); SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1803,13 +1803,13 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { Opc = is64bit ? ARM::VMULD : ARM::VMULS; break; } - unsigned Op1 = getRegForValue(I->getOperand(0)); + Register Op1 = getRegForValue(I->getOperand(0)); if (Op1 == 0) return false; - unsigned Op2 = getRegForValue(I->getOperand(1)); + Register Op2 = getRegForValue(I->getOperand(1)); if (Op2 == 0) return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Op1).addReg(Op2)); @@ -2022,7 +2022,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(-1ULL)); // Now the return value.
if (RetVT != MVT::isVoid) { @@ -2101,7 +2101,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { F.isVarArg())); const Value *RV = Ret->getOperand(0); - unsigned Reg = getRegForValue(RV); + Register Reg = getRegForValue(RV); if (Reg == 0) return false; @@ -2226,7 +2226,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { ArgVTs.reserve(I->getNumOperands()); ArgFlags.reserve(I->getNumOperands()); for (Value *Op : I->operands()) { - unsigned Arg = getRegForValue(Op); + Register Arg = getRegForValue(Op); if (Arg == 0) return false; Type *ArgTy = Op->getType(); @@ -2588,7 +2588,7 @@ bool ARMFastISel::SelectTrunc(const Instruction *I) { if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) return false; - unsigned SrcReg = getRegForValue(Op); + Register SrcReg = getRegForValue(Op); if (!SrcReg) return false; // Because the high bits are undefined, a truncate doesn't generate @@ -2744,7 +2744,7 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) { Type *SrcTy = Src->getType(); bool isZExt = isa<ZExtInst>(I); - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; EVT SrcEVT, DestEVT; @@ -2788,7 +2788,7 @@ bool ARMFastISel::SelectShift(const Instruction *I, } Value *Src1Value = I->getOperand(0); - unsigned Reg1 = getRegForValue(Src1Value); + Register Reg1 = getRegForValue(Src1Value); if (Reg1 == 0) return false; unsigned Reg2 = 0; @@ -2797,7 +2797,7 @@ bool ARMFastISel::SelectShift(const Instruction *I, if (Reg2 == 0) return false; } - unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass); + Register ResultReg = createResultReg(&ARM::GPRnopcRegClass); if(ResultReg == 0) return false; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2975,7 +2975,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) { MIB.add(predOps(ARMCC::AL)); // Fix the address by adding pc. - unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Register DestReg = createResultReg(TLI.getRegClassFor(VT)); Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? ARM::PICLDR : ARM::PICADD; DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0); @@ -2987,7 +2987,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) { MIB.add(predOps(ARMCC::AL)); if (UseGOT_PREL && Subtarget->isThumb()) { - unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + Register NewDestReg = createResultReg(TLI.getRegClassFor(VT)); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::t2LDRi12), NewDestReg) .addReg(DestReg) @@ -3057,11 +3057,11 @@ bool ARMFastISel::fastLowerArguments() { for (const Argument &Arg : F->args()) { unsigned ArgNo = Arg.getArgNo(); unsigned SrcReg = GPRArgRegs[ArgNo]; - unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). 
- unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 4b59f9cb94ce..1f2f6f7497e0 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -516,7 +516,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Determine spill area sizes. for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: @@ -751,7 +751,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator Pos = std::next(GPRCS1Push); int CFIIndex; for (const auto &Entry : CSI) { - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); int FI = Entry.getFrameIdx(); switch (Reg) { case ARM::R8: @@ -784,7 +784,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, if (GPRCS2Size > 0) { MachineBasicBlock::iterator Pos = std::next(GPRCS2Push); for (const auto &Entry : CSI) { - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); int FI = Entry.getFrameIdx(); switch (Reg) { case ARM::R8: @@ -794,7 +794,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, case ARM::R12: if (STI.splitFramePushPop(MF)) { unsigned DwarfReg = MRI->getDwarfRegNum( - Reg == ARM::R12 ? (unsigned)ARM::RA_AUTH_CODE : Reg, true); + Reg == ARM::R12 ? ARM::RA_AUTH_CODE : Reg, true); unsigned Offset = MFI.getObjectOffset(FI); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); @@ -812,7 +812,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // instructions in the prologue. MachineBasicBlock::iterator Pos = std::next(LastPush); for (const auto &Entry : CSI) { - unsigned Reg = Entry.getReg(); + Register Reg = Entry.getReg(); int FI = Entry.getFrameIdx(); if ((Reg >= ARM::D0 && Reg <= ARM::D31) && (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) { @@ -1144,7 +1144,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, while (i != 0) { unsigned LastReg = 0; for (; i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + Register Reg = CSI[i-1].getReg(); if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue; // D-registers in the aligned area DPRCS2 are NOT spilled here. @@ -1237,7 +1237,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, bool DeleteRet = false; for (; i != 0; --i) { CalleeSavedInfo &Info = CSI[i-1]; - unsigned Reg = Info.getReg(); + Register Reg = Info.getReg(); if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue; // The aligned reloads from area DPRCS2 are not inserted here. @@ -1812,7 +1812,7 @@ bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // shrinkwrapping can cause clobbering of r12 when the PAC code is // generated. A follow-up patch will fix this in a more performant manner. 
if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress( - false /*SpillsLR */)) + true /* SpillsLR */)) return false; return true; @@ -2353,7 +2353,7 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots( // LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8 CSI.insert(find_if(CSI, [=](const auto &CS) { - unsigned Reg = CS.getReg(); + Register Reg = CS.getReg(); return Reg == ARM::R10 || Reg == ARM::R11 || Reg == ARM::R8 || Reg == ARM::R9 || ARM::DPRRegClass.contains(Reg); diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp index f083fa6662e9..0d201a67af46 100644 --- a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -164,7 +164,7 @@ static bool getBaseOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, ARMBankConflictHazardRecognizer::ARMBankConflictHazardRecognizer( const ScheduleDAG *DAG, int64_t CPUBankMask, bool CPUAssumeITCMConflict) - : ScheduleHazardRecognizer(), MF(DAG->MF), DL(DAG->MF.getDataLayout()), + : MF(DAG->MF), DL(DAG->MF.getDataLayout()), DataMask(DataBankMask.getNumOccurrences() ? int64_t(DataBankMask) : CPUBankMask), AssumeITCMBankConflict(AssumeITCMConflict.getNumOccurrences() diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/llvm/lib/Target/ARM/ARMHazardRecognizer.h index c1f1bcd0a629..66a1477e5e08 100644 --- a/llvm/lib/Target/ARM/ARMHazardRecognizer.h +++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.h @@ -34,7 +34,7 @@ class ARMHazardRecognizerFPMLx : public ScheduleHazardRecognizer { unsigned FpMLxStalls = 0; public: - ARMHazardRecognizerFPMLx() : ScheduleHazardRecognizer() { MaxLookAhead = 1; } + ARMHazardRecognizerFPMLx() { MaxLookAhead = 1; } HazardType getHazardType(SUnit *SU, int Stalls) override; void Reset() override; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index bb2859c766c2..98c8133282a2 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -3227,7 +3227,7 @@ bool ARMDAGToDAGISel::transformFixedFloatingPointConversion(SDNode *N, if (!ImmAPF.getExactInverse(&ToConvert)) return false; } - APSInt Converted(64, 0); + APSInt Converted(64, false); bool IsExact; ToConvert.convertToInteger(Converted, llvm::RoundingMode::NearestTiesToEven, &IsExact); @@ -5737,8 +5737,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // them into a GPRPair. SDLoc dl(N); - SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) - : SDValue(nullptr,0); + SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps - 1) : SDValue(); SmallVector<bool, 8> OpChanged; // Glue node will be appended late. 
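Both tryInlineAsm hunks (above and in the hunk that follows) deal with promoting a pair of i32 values into a single GPRPair operand so 64-bit inline-asm constraints see one register. The standard construction is a REG_SEQUENCE machine node; the sketch below is close to, but not verbatim, the helper this function uses (names assumed):

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Fuse two i32 SDValues into an untyped GPRPair via REG_SEQUENCE.
static SDValue buildGPRPair(SelectionDAG &DAG, const SDLoc &DL, SDValue V0,
                            SDValue V1) {
  SDValue RC = DAG.getTargetConstant(ARM::GPRPairRegClassID, DL, MVT::i32);
  SDValue Sub0 = DAG.getTargetConstant(ARM::gsub_0, DL, MVT::i32);
  SDValue Sub1 = DAG.getTargetConstant(ARM::gsub_1, DL, MVT::i32);
  const SDValue Ops[] = {RC, V0, Sub0, V1, Sub1};
  // REG_SEQUENCE takes (regclass, value, subreg-index)+ and yields the pair.
  return SDValue(DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                    MVT::Untyped, Ops),
                 0);
}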
@@ -5801,8 +5800,8 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); SDValue V0 = N->getOperand(i+1); SDValue V1 = N->getOperand(i+2); - unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); - unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + Register Reg0 = cast<RegisterSDNode>(V0)->getReg(); + Register Reg1 = cast<RegisterSDNode>(V1)->getReg(); SDValue PairedReg; MachineRegisterInfo &MRI = MF->getRegInfo(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 3d45db349644..fe4e6b24367a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2899,7 +2899,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, unsigned Bytes = Arg.getValueSizeInBits() / 8; int FI = std::numeric_limits<int>::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { - unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); @@ -4018,7 +4018,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID( ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); assert(Mask && "Missing call preserved mask for calling convention"); // Mark LR an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); SDValue ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; @@ -4272,7 +4272,7 @@ SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; @@ -4342,7 +4342,7 @@ int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { - unsigned VReg = MF.addLiveIn(Reg, RC); + Register VReg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i)); @@ -4527,7 +4527,7 @@ SDValue ARMTargetLowering::LowerFormalArguments( llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this value is passed in r0 and has the returned attribute (e.g. @@ -6065,7 +6065,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ } // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } @@ -14682,7 +14682,9 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Check that N is CMPZ(CSINC(0, 0, CC, X)), return X if valid. 
+// Check that N is CMPZ(CSINC(0, 0, CC, X)), +// or CMPZ(CMOV(1, 0, CC, $cpsr, X)) +// return X if valid. static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1))) return SDValue(); @@ -14696,12 +14698,24 @@ static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse()) CSInc = CSInc.getOperand(0); - if (CSInc.getOpcode() != ARMISD::CSINC || - !isNullConstant(CSInc.getOperand(0)) || - !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse()) - return SDValue(); - CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); - return CSInc.getOperand(3); + if (CSInc.getOpcode() == ARMISD::CSINC && + isNullConstant(CSInc.getOperand(0)) && + isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) { + CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); + return CSInc.getOperand(3); + } + if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) && + isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) { + CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); + return CSInc.getOperand(4); + } + if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) && + isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) { + CC = ARMCC::getOppositeCondition( + (ARMCC::CondCodes)CSInc.getConstantOperandVal(2)); + return CSInc.getOperand(4); + } + return SDValue(); } static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) { @@ -15412,13 +15426,13 @@ static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, return SDValue(); SDLoc DL(Trunc); - if (isVMOVNTruncMask(N->getMask(), VT, 0)) + if (isVMOVNTruncMask(N->getMask(), VT, false)) return DAG.getNode( ARMISD::VMOVN, DL, VT, DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)), DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), DAG.getConstant(1, DL, MVT::i32)); - else if (isVMOVNTruncMask(N->getMask(), VT, 1)) + else if (isVMOVNTruncMask(N->getMask(), VT, true)) return DAG.getNode( ARMISD::VMOVN, DL, VT, DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)), @@ -18218,13 +18232,13 @@ SDValue ARMTargetLowering::PerformMVETruncCombine( SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end()); Mask.append(S1->getMask().begin(), S1->getMask().end()); - if (isVMOVNTruncMask(Mask, VT, 0)) + if (isVMOVNTruncMask(Mask, VT, false)) return DAG.getNode( ARMISD::VMOVN, DL, VT, DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)), DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), DAG.getConstant(1, DL, MVT::i32)); - if (isVMOVNTruncMask(Mask, VT, 1)) + if (isVMOVNTruncMask(Mask, VT, true)) return DAG.getNode( ARMISD::VMOVN, DL, VT, DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)), @@ -20775,10 +20789,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -20787,10 +20801,10 @@ bool 
ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getElementType()); + Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 5dee5e04af81..00db13f2eb52 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -28,8 +28,7 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI() {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index aaf3280ea150..357aa6d062e9 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4526,64 +4526,48 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (saddsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 DPR:$Vm))))), + def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1), (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))), (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (saddsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 DPR:$Vm))))), + def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1), (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))), (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (saddsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), - (v8i16 QPR:$Vm))))), + def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1), (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))), (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (saddsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), - (v4i32 QPR:$Vm))))), + def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1), (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))), (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (saddsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh + def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), - imm:$lane)))))), + imm:$lane)))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (saddsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh + def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), - imm:$lane)))))), + imm:$lane)))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (saddsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh + def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v8i16 
(ARMvduplane (v8i16 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (saddsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh + def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v4i32 (ARMvduplane (v4i32 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG @@ -4596,63 +4580,47 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (ssubsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), - (v4i16 DPR:$Vm))))), + def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1), (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))), (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (ssubsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), - (v2i32 DPR:$Vm))))), + def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1), (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))), (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (ssubsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), - (v8i16 QPR:$Vm))))), + def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1), (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))), (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (ssubsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), - (v4i32 QPR:$Vm))))), + def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1), (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))), (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (ssubsat - (v4i16 DPR:$src1), - (v4i16 (int_arm_neon_vqrdmulh + def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), - imm:$lane)))))), + imm:$lane)))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (ssubsat - (v2i32 DPR:$src1), - (v2i32 (int_arm_neon_vqrdmulh + def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), - imm:$lane)))))), + imm:$lane)))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (ssubsat - (v8i16 QPR:$src1), - (v8i16 (int_arm_neon_vqrdmulh + def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v8i16 (ARMvduplane (v8i16 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (ssubsat - (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh + def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v4i32 (ARMvduplane (v4i32 QPR:$src3), - imm:$lane)))))), + imm:$lane)))), (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), (v2i32 (EXTRACT_SUBREG diff --git a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp index 8be4e3f160e3..188b5562cac9 100644 --- a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp @@ 
-171,8 +171,8 @@ createARMInstructionSelector(const ARMBaseTargetMachine &TM, ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI) - : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), Opcodes(STI), + : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), + STI(STI), Opcodes(STI), #define GET_GLOBALISEL_PREDICATES_INIT #include "ARMGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp index 6649750bb388..ff4647dd46fd 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp @@ -15,4 +15,4 @@ using namespace llvm; void ARMRegisterInfo::anchor() { } -ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {} +ARMRegisterInfo::ARMRegisterInfo() {} diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.h b/llvm/lib/Target/ARM/ARMRegisterInfo.h index 87c0f322d3b3..2971b765a6fc 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.h @@ -17,8 +17,6 @@ namespace llvm { -class ARMSubtarget; - struct ARMRegisterInfo : public ARMBaseRegisterInfo { virtual void anchor(); public: diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 36c4bbaafcbf..2dd25234dc50 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -15,7 +15,6 @@ #include "ARMCallLowering.h" #include "ARMLegalizerInfo.h" #include "ARMRegisterBankInfo.h" -#include "ARMSubtarget.h" #include "ARMFrameLowering.h" #include "ARMInstrInfo.h" #include "ARMSubtarget.h" @@ -35,6 +34,7 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ARMTargetParser.h" #include "llvm/Support/TargetParser.h" #include "llvm/Target/TargetOptions.h" diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index e61b90af31b0..1c2b7ee6ba35 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -121,6 +121,7 @@ protected: ARMv85a, ARMv86a, ARMv87a, + ARMv88a, ARMv8a, ARMv8mBaseline, ARMv8mMainline, @@ -129,6 +130,7 @@ protected: ARMv9a, ARMv91a, ARMv92a, + ARMv93a, }; public: @@ -174,10 +176,12 @@ protected: bool HasV8_4aOps = false; bool HasV8_5aOps = false; bool HasV8_6aOps = false; + bool HasV8_8aOps = false; bool HasV8_7aOps = false; bool HasV9_0aOps = false; bool HasV9_1aOps = false; bool HasV9_2aOps = false; + bool HasV9_3aOps = false; bool HasV8MBaselineOps = false; bool HasV8MMainlineOps = false; bool HasV8_1MMainlineOps = false; @@ -635,9 +639,11 @@ public: bool hasV8_5aOps() const { return HasV8_5aOps; } bool hasV8_6aOps() const { return HasV8_6aOps; } bool hasV8_7aOps() const { return HasV8_7aOps; } + bool hasV8_8aOps() const { return HasV8_8aOps; } bool hasV9_0aOps() const { return HasV9_0aOps; } bool hasV9_1aOps() const { return HasV9_1aOps; } bool hasV9_2aOps() const { return HasV9_2aOps; } + bool hasV9_3aOps() const { return HasV9_3aOps; } bool hasV8MBaselineOps() const { return HasV8MBaselineOps; } bool hasV8MMainlineOps() const { return HasV8MMainlineOps; } bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 0b314ac2a41e..c38970f8e341 100644 --- 
a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -43,6 +43,7 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ARMTargetParser.h" #include "llvm/Support/TargetParser.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp index 8c5438f7093b..936cae17f004 100644 --- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -54,9 +54,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, } } -const MCRegister ARMElfTargetObjectFile::getStaticBase() const { - return ARM::R9; -} +MCRegister ARMElfTargetObjectFile::getStaticBase() const { return ARM::R9; } const MCExpr *ARMElfTargetObjectFile:: getIndirectSymViaRWPI(const MCSymbol *Sym) const { diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/llvm/lib/Target/ARM/ARMTargetObjectFile.h index 8b13198fe144..47334b9a8a45 100644 --- a/llvm/lib/Target/ARM/ARMTargetObjectFile.h +++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.h @@ -17,14 +17,13 @@ namespace llvm { class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { public: - ARMElfTargetObjectFile() - : TargetLoweringObjectFileELF() { + ARMElfTargetObjectFile() { PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31; } void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - const MCRegister getStaticBase() const override; + MCRegister getStaticBase() const override; const MCExpr *getIndirectSymViaRWPI(const MCSymbol *Sym) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 602c6745d310..e0750a9945d2 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1116,18 +1116,6 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) return false; - // This method is called in 2 places: - // - from the vectorizer with a scalar type, in which case we need to get - // this as good as we can with the limited info we have (and rely on the cost - // model for the rest). - // - from the masked intrinsic lowering pass with the actual vector type. - // For MVE, we have a custom lowering pass that will already have custom - // legalised any gathers that we can to MVE intrinsics, and want to expand all - // the rest. The pass runs before the masked intrinsic lowering pass, so if we - // are here, we know we want to expand. - if (isa<VectorType>(Ty)) - return false; - unsigned EltWidth = Ty->getScalarSizeInBits(); return ((EltWidth == 32 && Alignment >= 4) || (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index a56886d4fc11..5bb84899e5ef 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -189,6 +189,18 @@ public: return isLegalMaskedLoad(DataTy, Alignment); } + bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { + // For MVE, we have a custom lowering pass that will already have custom + // legalised any gathers that we can lower to MVE intrinsics, and want to + // expand all the rest. The pass runs before the masked intrinsic lowering + // pass. 
+ return true; + } + + bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { + return forceScalarizeMaskedGather(VTy, Alignment); + } + bool isLegalMaskedGather(Type *Ty, Align Alignment); bool isLegalMaskedScatter(Type *Ty, Align Alignment) { diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index bfe078b06861..c7734cc2cf11 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -921,7 +921,7 @@ class ARMOperand : public MCParsedAsmOperand { }; public: - ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + ARMOperand(KindTy K) : Kind(K) {} /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } @@ -1870,7 +1870,7 @@ public: } template <int shift> bool isMemRegRQOffset() const { - if (!isMVEMem() || Memory.OffsetImm != 0 || Memory.Alignment != 0) + if (!isMVEMem() || Memory.OffsetImm != nullptr || Memory.Alignment != 0) return false; if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains( diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 851acea94022..23430dfc017a 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1049,11 +1049,11 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, unsigned Kind = Fixup.getKind(); if (Kind >= FirstLiteralRelocationKind) return; - unsigned NumBytes = getFixupKindNumBytes(Kind); MCContext &Ctx = Asm.getContext(); Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI); if (!Value) return; // Doesn't change encoding. + const unsigned NumBytes = getFixupKindNumBytes(Kind); unsigned Offset = Fixup.getOffset(); assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); @@ -1123,9 +1123,8 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( DenseMap<unsigned, int> RegOffsets; int FloatRegCount = 0; // Process each .cfi directive and build up compact unwind info. 
- for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + for (const MCCFIInstruction &Inst : Instrs) { unsigned Reg; - const MCCFIInstruction &Inst = Instrs[i]; switch (Inst.getOperation()) { case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa CFARegisterOffset = Inst.getOffset(); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 37d81e4b0af1..df8f54d14a86 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -87,7 +87,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, if (IsPCRel) { switch (Fixup.getTargetKind()) { default: - Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); + Ctx.reportError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; case FK_Data_4: switch (Modifier) { @@ -159,7 +159,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, } switch (Kind) { default: - Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); + Ctx.reportError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; case FK_Data_1: switch (Modifier) { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index e060e59e3759..16bc0ca179a7 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -264,10 +264,8 @@ void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) { void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes) { OS << "\t.unwind_raw " << Offset; - for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(), - OCE = Opcodes.end(); - OCI != OCE; ++OCI) - OS << ", 0x" << Twine::utohexstr(*OCI); + for (uint8_t Opcode : Opcodes) + OS << ", 0x" << Twine::utohexstr(Opcode); OS << '\n'; } @@ -788,6 +786,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::ArchKind::ARMV9A: case ARM::ArchKind::ARMV9_1A: case ARM::ArchKind::ARMV9_2A: + case ARM::ArchKind::ARMV9_3A: S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false); S.setAttributeItem(ARM_ISA_use, Allowed, false); S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 05e5a473a3c6..17ca1866cf95 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -338,8 +338,8 @@ void ARM_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::ARM_NQ14, ARM::Q14}, {codeview::RegisterId::ARM_NQ15, ARM::Q15}, }; - for (unsigned I = 0; I < array_lengthof(RegMap); ++I) - MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); + for (const auto &I : RegMap) + MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); } static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 7ccdc6f85500..5c8f9bfdca08 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -36,8 +36,6 @@ class MCTargetStreamer; class StringRef; class Target; class Triple; -class raw_ostream; -class raw_pwrite_stream; namespace ARM_MC { std::string 
ParseARMTriple(const Triple &TT, StringRef CPU); diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 54e80a095dd4..71a82a1e3271 100644 --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -167,7 +167,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, DebugLoc dl; Register FramePtr = RegInfo->getFrameRegister(MF); - unsigned BasePtr = RegInfo->getBaseRegister(); + Register BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. @@ -206,7 +206,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, } for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: @@ -267,7 +267,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup); } for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: @@ -348,7 +348,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // Emit call frame information for the callee-saved high registers. for (auto &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: @@ -376,7 +376,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // at this point in the prologue, so pick one. unsigned ScratchRegister = ARM::NoRegister; for (auto &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) { ScratchRegister = Reg; break; @@ -531,7 +531,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, unsigned ScratchRegister = ARM::NoRegister; bool HasFP = hasFP(MF); for (auto &I : MFI.getCalleeSavedInfo()) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) { ScratchRegister = Reg; break; @@ -825,7 +825,7 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters( // LoRegs for saving HiRegs. for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) { LoRegsToSave[Reg] = true; @@ -949,7 +949,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( ARMRegSet CopyRegs; for (CalleeSavedInfo I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) { LoRegsToRestore[Reg] = true; @@ -1022,7 +1022,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( bool NeedsPop = false; for (CalleeSavedInfo &Info : llvm::reverse(CSI)) { - unsigned Reg = Info.getReg(); + Register Reg = Info.getReg(); // High registers (excluding lr) have already been dealt with if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR)) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b18f5e20d40..1a36c2ca9152 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -21,7 +21,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI() {} + : ARMBaseInstrInfo(STI) {} /// Return the noop instruction to use for a noop. 
MCInst Thumb1InstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index e6d51796ba4d..a83ff5e51004 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -18,7 +18,6 @@ namespace llvm { class ARMSubtarget; -class ScheduleHazardRecognizer; class Thumb2InstrInfo : public ARMBaseInstrInfo { ThumbRegisterInfo RI; diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index 1164b6ebbac3..1cc5422523f1 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -1147,9 +1147,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { // predecessors. ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); bool Modified = false; - for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator - I = RPOT.begin(), E = RPOT.end(); I != E; ++I) - Modified |= ReduceMBB(**I); + for (MachineBasicBlock *MBB : RPOT) + Modified |= ReduceMBB(*MBB); return Modified; } diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp index 4da6f6ab6994..5d2bc4ebe191 100644 --- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -37,7 +37,7 @@ extern cl::opt<bool> ReuseFrameIndexVals; using namespace llvm; -ThumbRegisterInfo::ThumbRegisterInfo() : ARMBaseRegisterInfo() {} +ThumbRegisterInfo::ThumbRegisterInfo() {} const TargetRegisterClass * ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h index 143c339c0664..0b512172ba10 100644 --- a/llvm/lib/Target/AVR/AVR.h +++ b/llvm/lib/Target/AVR/AVR.h @@ -28,7 +28,6 @@ FunctionPass *createAVRISelDag(AVRTargetMachine &TM, FunctionPass *createAVRExpandPseudoPass(); FunctionPass *createAVRFrameAnalyzerPass(); FunctionPass *createAVRRelaxMemPass(); -FunctionPass *createAVRDynAllocaSRPass(); FunctionPass *createAVRBranchSelectionPass(); void initializeAVRShiftExpandPass(PassRegistry &); @@ -39,17 +38,56 @@ void initializeAVRRelaxMemPass(PassRegistry &); namespace AVR { /// An integer that identifies all of the supported AVR address spaces. -enum AddressSpace { DataMemory, ProgramMemory }; +enum AddressSpace { + DataMemory, + ProgramMemory, + ProgramMemory1, + ProgramMemory2, + ProgramMemory3, + ProgramMemory4, + ProgramMemory5, + NumAddrSpaces, +}; /// Checks if a given type is a pointer to program memory. 
template <typename T> bool isProgramMemoryAddress(T *V) { - return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory; + auto *PT = cast<PointerType>(V->getType()); + assert(PT != nullptr && "unexpected MemSDNode"); + return PT->getAddressSpace() == ProgramMemory || + PT->getAddressSpace() == ProgramMemory1 || + PT->getAddressSpace() == ProgramMemory2 || + PT->getAddressSpace() == ProgramMemory3 || + PT->getAddressSpace() == ProgramMemory4 || + PT->getAddressSpace() == ProgramMemory5; +} + +template <typename T> AddressSpace getAddressSpace(T *V) { + auto *PT = cast<PointerType>(V->getType()); + assert(PT != nullptr && "unexpected MemSDNode"); + unsigned AS = PT->getAddressSpace(); + if (AS < NumAddrSpaces) + return static_cast<AddressSpace>(AS); + return NumAddrSpaces; } inline bool isProgramMemoryAccess(MemSDNode const *N) { - auto V = N->getMemOperand()->getValue(); + auto *V = N->getMemOperand()->getValue(); + if (V != nullptr && isProgramMemoryAddress(V)) + return true; + return false; +} - return (V != nullptr) ? isProgramMemoryAddress(V) : false; +// Get the index of the program memory bank. +// -1: not program memory +// 0: ordinary program memory +// 1~5: extended program memory +inline int getProgramMemoryBank(MemSDNode const *N) { + auto *V = N->getMemOperand()->getValue(); + if (V == nullptr || !isProgramMemoryAddress(V)) + return -1; + AddressSpace AS = getAddressSpace(V); + assert(ProgramMemory <= AS && AS <= ProgramMemory5); + return static_cast<int>(AS - ProgramMemory); } } // end of namespace AVR diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td index 87874c5c50b2..b4bc35e191c0 100644 --- a/llvm/lib/Target/AVR/AVRCallingConv.td +++ b/llvm/lib/Target/AVR/AVRCallingConv.td @@ -36,4 +36,4 @@ def ArgCC_AVR_Vararg : CallingConv<[ //===----------------------------------------------------------------------===// def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>; -def CSR_Interrupts : CalleeSavedRegs<(add(sequence "R%u", 31, 0))>; +def CSR_Interrupts : CalleeSavedRegs<(add(sequence "R%u", 31, 2))>; diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp index cb85d73772c5..144ae2b320f9 100644 --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -92,6 +92,7 @@ private: /// Specific shift implementation. bool expandLSLB7Rd(Block &MBB, BlockIt MBBI); bool expandLSRB7Rd(Block &MBB, BlockIt MBBI); + bool expandASRB6Rd(Block &MBB, BlockIt MBBI); bool expandASRB7Rd(Block &MBB, BlockIt MBBI); bool expandLSLW4Rd(Block &MBB, BlockIt MBBI); bool expandLSRW4Rd(Block &MBB, BlockIt MBBI); @@ -101,6 +102,9 @@ private: bool expandLSLW12Rd(Block &MBB, BlockIt MBBI); bool expandLSRW12Rd(Block &MBB, BlockIt MBBI); + // Common implementation of LPMWRdZ and ELPMWRdZ. + bool expandLPMWELPMW(Block &MBB, BlockIt MBBI, bool IsExt); + /// Scavenges a free GPR8 register for use. 
Register scavengeGPR8(MachineInstr &MI); }; @@ -808,18 +812,25 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) { return true; } -template <> -bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) { +bool AVRExpandPseudo::expandLPMWELPMW(Block &MBB, BlockIt MBBI, bool IsExt) { MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; Register DstReg = MI.getOperand(0).getReg(); Register TmpReg = 0; // 0 for no temporary register Register SrcReg = MI.getOperand(1).getReg(); bool SrcIsKill = MI.getOperand(1).isKill(); - unsigned OpLo = AVR::LPMRdZPi; - unsigned OpHi = AVR::LPMRdZ; + unsigned OpLo = IsExt ? AVR::ELPMRdZPi : AVR::LPMRdZPi; + unsigned OpHi = IsExt ? AVR::ELPMRdZ : AVR::LPMRdZ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); + // Set the I/O register RAMPZ for ELPM. + if (IsExt) { + const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>(); + Register Bank = MI.getOperand(2).getReg(); + // out RAMPZ, rtmp + buildMI(MBB, MBBI, AVR::OUTARr).addImm(STI.getIORegRAMPZ()).addReg(Bank); + } + // Use a temporary register if src and dst registers are the same. if (DstReg == SrcReg) TmpReg = scavengeGPR8(MI); @@ -857,8 +868,51 @@ bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) { } template <> +bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) { + return expandLPMWELPMW(MBB, MBBI, false); +} + +template <> +bool AVRExpandPseudo::expand<AVR::ELPMWRdZ>(Block &MBB, BlockIt MBBI) { + return expandLPMWELPMW(MBB, MBBI, true); +} + +template <> +bool AVRExpandPseudo::expand<AVR::ELPMBRdZ>(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register BankReg = MI.getOperand(2).getReg(); + bool SrcIsKill = MI.getOperand(1).isKill(); + const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>(); + + // Set the I/O register RAMPZ for ELPM (out RAMPZ, rtmp). + buildMI(MBB, MBBI, AVR::OUTARr).addImm(STI.getIORegRAMPZ()).addReg(BankReg); + + // Load byte. 
+ auto MILB = buildMI(MBB, MBBI, AVR::ELPMRdZ) + .addReg(DstReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + + MILB.setMemRefs(MI.memoperands()); + + MI.eraseFromParent(); + return true; +} + +template <> bool AVRExpandPseudo::expand<AVR::LPMWRdZPi>(Block &MBB, BlockIt MBBI) { - llvm_unreachable("wide LPMPi is unimplemented"); + llvm_unreachable("16-bit LPMPi is unimplemented"); +} + +template <> +bool AVRExpandPseudo::expand<AVR::ELPMBRdZPi>(Block &MBB, BlockIt MBBI) { + llvm_unreachable("byte ELPMPi is unimplemented"); +} + +template <> +bool AVRExpandPseudo::expand<AVR::ELPMWRdZPi>(Block &MBB, BlockIt MBBI) { + llvm_unreachable("16-bit ELPMPi is unimplemented"); } template <typename Func> @@ -1411,6 +1465,30 @@ bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) { return true; } +template <> +bool AVRExpandPseudo::expand<AVR::LSLWHiRd>(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstLoReg, DstHiReg; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); + bool ImpIsDead = MI.getOperand(2).isDead(); + TRI->splitReg(DstReg, DstLoReg, DstHiReg); + + // add hireg, hireg <==> lsl hireg + auto MILSL = + buildMI(MBB, MBBI, AVR::ADDRdRr) + .addReg(DstHiReg, RegState::Define, getDeadRegState(DstIsDead)) + .addReg(DstHiReg, getKillRegState(DstIsKill)) + .addReg(DstHiReg, getKillRegState(DstIsKill)); + + if (ImpIsDead) + MILSL->getOperand(3).setIsDead(); + + MI.eraseFromParent(); + return true; +} + bool AVRExpandPseudo::expandLSLW4Rd(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; @@ -1586,6 +1664,29 @@ bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) { return true; } +template <> +bool AVRExpandPseudo::expand<AVR::LSRWLoRd>(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstLoReg, DstHiReg; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); + bool ImpIsDead = MI.getOperand(2).isDead(); + TRI->splitReg(DstReg, DstLoReg, DstHiReg); + + // lsr loreg + auto MILSR = + buildMI(MBB, MBBI, AVR::LSRRd) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg, getKillRegState(DstIsKill)); + + if (ImpIsDead) + MILSR->getOperand(2).setIsDead(); + + MI.eraseFromParent(); + return true; +} + bool AVRExpandPseudo::expandLSRW4Rd(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; @@ -1773,6 +1874,29 @@ bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) { return true; } +template <> +bool AVRExpandPseudo::expand<AVR::ASRWLoRd>(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstLoReg, DstHiReg; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); + bool ImpIsDead = MI.getOperand(2).isDead(); + TRI->splitReg(DstReg, DstLoReg, DstHiReg); + + // asr loreg + auto MIASR = + buildMI(MBB, MBBI, AVR::ASRRd) + .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstLoReg, getKillRegState(DstIsKill)); + + if (ImpIsDead) + MIASR->getOperand(2).setIsDead(); + + MI.eraseFromParent(); + return true; +} + bool AVRExpandPseudo::expandASRW8Rd(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; @@ -1921,6 +2045,44 @@ bool AVRExpandPseudo::expand<AVR::LSRBNRd>(Block &MBB, BlockIt MBBI) { } } +bool 
AVRExpandPseudo::expandASRB6Rd(Block &MBB, BlockIt MBBI) { + MachineInstr &MI = *MBBI; + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool DstIsKill = MI.getOperand(1).isKill(); + + // bst r24, 6 + // lsl r24 + // sbc r24, r24 + // bld r24, 0 + + buildMI(MBB, MBBI, AVR::BST) + .addReg(DstReg) + .addImm(6) + ->getOperand(2) + .setIsUndef(true); + + buildMI(MBB, MBBI, AVR::ADDRdRr) // LSL Rd <==> ADD Rd, Rd + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg, getKillRegState(DstIsKill)); + + buildMI(MBB, MBBI, AVR::SBCRdRr) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg, getKillRegState(DstIsKill)); + + buildMI(MBB, MBBI, AVR::BLD) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg, getKillRegState(DstIsKill)) + .addImm(0) + ->getOperand(3) + .setIsKill(); + + MI.eraseFromParent(); + return true; +} + bool AVRExpandPseudo::expandASRB7Rd(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; Register DstReg = MI.getOperand(0).getReg(); @@ -1957,6 +2119,8 @@ bool AVRExpandPseudo::expand<AVR::ASRBNRd>(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned Imm = MI.getOperand(2).getImm(); switch (Imm) { + case 6: + return expandASRB6Rd(MBB, MBBI); case 7: return expandASRB7Rd(MBB, MBBI); default: @@ -2158,6 +2322,10 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) { EXPAND(AVR::LDDWRdPtrQ); EXPAND(AVR::LPMWRdZ); EXPAND(AVR::LPMWRdZPi); + EXPAND(AVR::ELPMBRdZ); + EXPAND(AVR::ELPMWRdZ); + EXPAND(AVR::ELPMBRdZPi); + EXPAND(AVR::ELPMWRdZPi); EXPAND(AVR::AtomicLoad8); EXPAND(AVR::AtomicLoad16); EXPAND(AVR::AtomicStore8); @@ -2189,6 +2357,9 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) { EXPAND(AVR::RORWRd); EXPAND(AVR::ROLWRd); EXPAND(AVR::ASRWRd); + EXPAND(AVR::LSLWHiRd); + EXPAND(AVR::LSRWLoRd); + EXPAND(AVR::ASRWLoRd); EXPAND(AVR::LSLWNRd); EXPAND(AVR::LSRWNRd); EXPAND(AVR::ASRWNRd); diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 543d94875037..b3bc9ede205e 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -79,11 +79,6 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF, .addReg(AVR::R0, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr)) - .addReg(AVR::R0, RegState::Define) - .addReg(AVR::R0, RegState::Kill) - .addReg(AVR::R0, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr)) .addReg(AVR::R1, RegState::Define) .addReg(AVR::R1, RegState::Kill) .addReg(AVR::R1, RegState::Kill) @@ -176,7 +171,7 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF, const AVRInstrInfo &TII = *STI.getInstrInfo(); // Early exit if there is no need to restore the frame pointer. - if (!FrameSize) { + if (!FrameSize && !MF.getFrameInfo().hasVarSizedObjects()) { restoreStatusRegister(MF, MBB); return; } @@ -193,22 +188,24 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF, --MBBI; } - unsigned Opcode; + if (FrameSize) { + unsigned Opcode; - // Select the optimal opcode depending on how big it is. - if (isUInt<6>(FrameSize)) { - Opcode = AVR::ADIWRdK; - } else { - Opcode = AVR::SUBIWRdK; - FrameSize = -FrameSize; - } + // Select the optimal opcode depending on how big it is. 
+ if (isUInt<6>(FrameSize)) { + Opcode = AVR::ADIWRdK; + } else { + Opcode = AVR::SUBIWRdK; + FrameSize = -FrameSize; + } - // Restore the frame pointer by doing FP += <size>. - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28) - .addReg(AVR::R29R28, RegState::Kill) - .addImm(FrameSize); - // The SREG implicit def is dead. - MI->getOperand(3).setIsDead(); + // Restore the frame pointer by doing FP += <size>. + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28) + .addReg(AVR::R29R28, RegState::Kill) + .addImm(FrameSize); + // The SREG implicit def is dead. + MI->getOperand(3).setIsDead(); + } // Write back R29R28 to SP and temporarily disable interrupts. BuildMI(MBB, MBBI, DL, TII.get(AVR::SPWRITE), AVR::SP) @@ -230,7 +227,8 @@ bool AVRFrameLowering::hasFP(const MachineFunction &MF) const { const AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>(); return (FuncInfo->getHasSpills() || FuncInfo->getHasAllocas() || - FuncInfo->getHasStackArgs()); + FuncInfo->getHasStackArgs() || + MF.getFrameInfo().hasVarSizedObjects()); } bool AVRFrameLowering::spillCalleeSavedRegisters( @@ -248,7 +246,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( AVRMachineFunctionInfo *AVRFI = MF.getInfo<AVRMachineFunctionInfo>(); for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); bool IsNotLiveIn = !MBB.isLiveIn(Reg); assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && @@ -286,7 +284,7 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters( const TargetInstrInfo &TII = *STI.getInstrInfo(); for (const CalleeSavedInfo &CCSI : CSI) { - unsigned Reg = CCSI.getReg(); + Register Reg = CCSI.getReg(); assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); @@ -480,56 +478,4 @@ char AVRFrameAnalyzer::ID = 0; /// Creates instance of the frame analyzer pass. FunctionPass *createAVRFrameAnalyzerPass() { return new AVRFrameAnalyzer(); } -/// Create the Dynalloca Stack Pointer Save/Restore pass. -/// Insert a copy of SP before allocating the dynamic stack memory and restore -/// it in function exit to restore the original SP state. This avoids the need -/// of reserving a register pair for a frame pointer. -struct AVRDynAllocaSR : public MachineFunctionPass { - static char ID; - AVRDynAllocaSR() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - // Early exit when there are no variable sized objects in the function. - if (!MF.getFrameInfo().hasVarSizedObjects()) { - return false; - } - - const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); - MachineBasicBlock &EntryMBB = MF.front(); - MachineBasicBlock::iterator MBBI = EntryMBB.begin(); - DebugLoc DL = EntryMBB.findDebugLoc(MBBI); - - Register SPCopy = - MF.getRegInfo().createVirtualRegister(&AVR::DREGSRegClass); - - // Create a copy of SP in function entry before any dynallocas are - // inserted. - BuildMI(EntryMBB, MBBI, DL, TII.get(AVR::COPY), SPCopy).addReg(AVR::SP); - - // Restore SP in all exit basic blocks. - for (MachineBasicBlock &MBB : MF) { - // If last instruction is a return instruction, add a restore copy. 
- if (!MBB.empty() && MBB.back().isReturn()) { - MBBI = MBB.getLastNonDebugInstr(); - DL = MBBI->getDebugLoc(); - BuildMI(MBB, MBBI, DL, TII.get(AVR::COPY), AVR::SP) - .addReg(SPCopy, RegState::Kill); - } - } - - return true; - } - - StringRef getPassName() const override { - return "AVR dynalloca stack pointer save/restore"; - } -}; - -char AVRDynAllocaSR::ID = 0; - -/// createAVRDynAllocaSRPass - returns an instance of the dynalloca stack -/// pointer save/restore pass. -FunctionPass *createAVRDynAllocaSRPass() { return new AVRDynAllocaSR(); } - } // end of namespace llvm diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp index 7ec2629ab45d..df364cae671c 100644 --- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -38,7 +38,7 @@ public: bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Disp); bool selectIndexedLoad(SDNode *N); - unsigned selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT); + unsigned selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT, int Bank); bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) override; @@ -165,35 +165,31 @@ bool AVRDAGToDAGISel::selectIndexedLoad(SDNode *N) { return true; } -unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD, - MVT VT) { - ISD::MemIndexedMode AM = LD->getAddressingMode(); - +unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT, + int Bank) { // Progmem indexed loads only work in POSTINC mode. - if (LD->getExtensionType() != ISD::NON_EXTLOAD || AM != ISD::POST_INC) { + if (LD->getExtensionType() != ISD::NON_EXTLOAD || + LD->getAddressingMode() != ISD::POST_INC) return 0; - } + + // Feature ELPM is needed for loading from extended program memory. + assert((Bank == 0 || Subtarget->hasELPM()) && + "cannot load from extended program memory on this mcu"); unsigned Opcode = 0; int Offs = cast<ConstantSDNode>(LD->getOffset())->getSExtValue(); switch (VT.SimpleTy) { - case MVT::i8: { - if (Offs != 1) { - return 0; - } - Opcode = AVR::LPMRdZPi; + case MVT::i8: + if (Offs == 1) + Opcode = Bank > 0 ? AVR::ELPMBRdZPi : AVR::LPMRdZPi; break; - } - case MVT::i16: { - if (Offs != 2) { - return 0; - } - Opcode = AVR::LPMWRdZPi; + case MVT::i16: + if (Offs == 2) + Opcode = Bank > 0 ? AVR::ELPMWRdZPi : AVR::LPMWRdZPi; break; - } default: - return 0; + break; } return Opcode; @@ -360,7 +356,12 @@ template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) { return selectIndexedLoad(N); } - assert(Subtarget->hasLPM() && "cannot load from program memory on this mcu"); + if (!Subtarget->hasLPM()) + report_fatal_error("cannot load from program memory on this mcu"); + + int ProgMemBank = AVR::getProgramMemoryBank(LD); + if (ProgMemBank < 0 || ProgMemBank > 5) + report_fatal_error("unexpected program memory bank"); // This is a flash memory load, move the pointer into R31R30 and emit // the lpm instruction. @@ -374,25 +375,48 @@ template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) { Ptr = CurDAG->getCopyFromReg(Chain, DL, AVR::R31R30, MVT::i16, Chain.getValue(1)); - SDValue RegZ = CurDAG->getRegister(AVR::R31R30, MVT::i16); - // Check if the opcode can be converted into an indexed load. - if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT)) { + if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT, ProgMemBank)) { // It is legal to fold the load into an indexed load. 
- ResNode = - CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr, RegZ); - ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1)); + if (ProgMemBank == 0) { + ResNode = + CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr); + } else { + // Do not combine the LDI instruction into the ELPM pseudo instruction, + // since it may be reused by other ELPM pseudo instructions. + SDValue NC = CurDAG->getTargetConstant(ProgMemBank, DL, MVT::i8); + auto *NP = CurDAG->getMachineNode(AVR::LDIRdK, DL, MVT::i8, NC); + ResNode = CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, + Ptr, SDValue(NP, 0)); + } } else { // Selecting an indexed load is not legal, fallback to a normal load. switch (VT.SimpleTy) { case MVT::i8: - ResNode = CurDAG->getMachineNode(AVR::LPMRdZ, DL, MVT::i8, MVT::Other, - Ptr, RegZ); + if (ProgMemBank == 0) { + ResNode = + CurDAG->getMachineNode(AVR::LPMRdZ, DL, MVT::i8, MVT::Other, Ptr); + } else { + // Do not combine the LDI instruction into the ELPM pseudo instruction, + // since it may be reused by other ELPM pseudo instructions. + SDValue NC = CurDAG->getTargetConstant(ProgMemBank, DL, MVT::i8); + auto *NP = CurDAG->getMachineNode(AVR::LDIRdK, DL, MVT::i8, NC); + ResNode = CurDAG->getMachineNode(AVR::ELPMBRdZ, DL, MVT::i8, MVT::Other, + Ptr, SDValue(NP, 0)); + } break; case MVT::i16: - ResNode = CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16, MVT::Other, - Ptr, RegZ); - ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1)); + if (ProgMemBank == 0) { + ResNode = + CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16, MVT::Other, Ptr); + } else { + // Do not combine the LDI instruction into the ELPM pseudo instruction, + // since LDI requires the destination register in range R16~R31. + SDValue NC = CurDAG->getTargetConstant(ProgMemBank, DL, MVT::i8); + auto *NP = CurDAG->getMachineNode(AVR::LDIRdK, DL, MVT::i8, NC); + ResNode = CurDAG->getMachineNode(AVR::ELPMWRdZ, DL, MVT::i16, + MVT::Other, Ptr, SDValue(NP, 0)); + } break; default: llvm_unreachable("Unsupported VT!"); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index a6f2afb87102..a58fedf6cd36 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -359,6 +359,11 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { Victim = DAG.getNode(AVRISD::LSRBN, dl, VT, Victim, DAG.getConstant(7, dl, VT)); ShiftAmount = 0; + } else if (Op.getOpcode() == ISD::SRA && ShiftAmount == 6) { + // Optimize ASR when ShiftAmount == 6. + Victim = DAG.getNode(AVRISD::ASRBN, dl, VT, Victim, + DAG.getConstant(6, dl, VT)); + ShiftAmount = 0; } else if (Op.getOpcode() == ISD::SRA && ShiftAmount == 7) { // Optimize ASR when ShiftAmount == 7. Victim = DAG.getNode(AVRISD::ASRBN, dl, VT, Victim, @@ -387,16 +392,22 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim, DAG.getConstant(8, dl, VT)); ShiftAmount -= 8; + // Only operate on the higher byte for remaining shift bits. + Opc8 = AVRISD::LSLHI; break; case ISD::SRL: Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim, DAG.getConstant(8, dl, VT)); ShiftAmount -= 8; + // Only operate on the lower byte for remaining shift bits. + Opc8 = AVRISD::LSRLO; break; case ISD::SRA: Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim, DAG.getConstant(8, dl, VT)); ShiftAmount -= 8; + // Only operate on the lower byte for remaining shift bits. 
+ Opc8 = AVRISD::ASRLO; break; default: break; @@ -407,11 +418,22 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim, DAG.getConstant(12, dl, VT)); ShiftAmount -= 12; + // Only operate on the higher byte for remaining shift bits. + Opc8 = AVRISD::LSLHI; break; case ISD::SRL: Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim, DAG.getConstant(12, dl, VT)); ShiftAmount -= 12; + // Only operate on the lower byte for remaining shift bits. + Opc8 = AVRISD::LSRLO; + break; + case ISD::SRA: + Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim, + DAG.getConstant(8, dl, VT)); + ShiftAmount -= 8; + // Only operate on the lower byte for remaining shift bits. + Opc8 = AVRISD::ASRLO; break; default: break; @@ -874,7 +896,8 @@ bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL, // Allow reg+<6bit> offset. if (Offs < 0) Offs = -Offs; - if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Offs)) { + if (AM.BaseGV == nullptr && AM.HasBaseReg && AM.Scale == 0 && + isUInt<6>(Offs)) { return true; } @@ -1169,7 +1192,7 @@ SDValue AVRTargetLowering::LowerFormalArguments( llvm_unreachable("Unknown argument type!"); } - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // :NOTE: Clang should not promote any i8 into i16 but for safety the @@ -1672,6 +1695,18 @@ MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI, return BB; } +// Insert a read from R1, which almost always contains the value 0. +MachineBasicBlock * +AVRTargetLowering::insertCopyR1(MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + MachineBasicBlock::iterator I(MI); + BuildMI(*BB, I, MI.getDebugLoc(), TII.get(AVR::COPY)) + .add(MI.getOperand(0)) + .addReg(AVR::R1); + MI.eraseFromParent(); + return BB; +} + MachineBasicBlock * AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -1694,6 +1729,8 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AVR::MULRdRr: case AVR::MULSRdRr: return insertMul(MI, MBB); + case AVR::CopyR1: + return insertCopyR1(MI, MBB); } assert((Opc == AVR::Select16 || Opc == AVR::Select8) && @@ -2012,7 +2049,7 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result; SDLoc DL(Op); EVT Ty = Op.getValueType(); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h index 3ae036b66bcb..116417b61566 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.h +++ b/llvm/lib/Target/AVR/AVRISelLowering.h @@ -38,12 +38,15 @@ enum NodeType { LSL, ///< Logical shift left. LSLBN, ///< Byte logical shift left N bits. LSLWN, ///< Word logical shift left N bits. + LSLHI, ///< Higher 8-bit of word logical shift left. LSR, ///< Logical shift right. LSRBN, ///< Byte logical shift right N bits. LSRWN, ///< Word logical shift right N bits. + LSRLO, ///< Lower 8-bit of word logical shift right. ASR, ///< Arithmetic shift right. ASRBN, ///< Byte arithmetic shift right N bits. ASRWN, ///< Word arithmetic shift right N bits. + ASRLO, ///< Lower 8-bit of word arithmetic shift right. ROR, ///< Bit rotate right. ROL, ///< Bit rotate left. LSLLOOP, ///< A loop of single logical shift left instructions. 
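The three node types added above (LSLHI, LSRLO, ASRLO) exist so that a 16-bit shift by 9..15 can be lowered as one LSLWN/LSRWN/ASRWN-by-8 byte move followed by single-bit shifts confined to the byte that still carries data, as wired up in the LowerShifts hunks just before this. A self-contained check of that decomposition for the logical-right case (the function name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Emulates the lowering above for a 16-bit logical right shift by 8..15:
    // first LSRWN 8 (move the high byte into the low byte, zero the high byte),
    // then run the remaining n - 8 single-bit shifts on the low byte only
    // (the LSRLO node), since the high byte is already known to be zero.
    uint16_t lsrWordBy(uint16_t V, unsigned N) {
      assert(N >= 8 && N <= 15);
      V >>= 8;                 // LSRWN 8: hi -> lo, hi := 0
      for (unsigned I = 8; I < N; ++I)
        V = (V & 0x00FF) >> 1; // LSRLO: shift only the low byte
      return V;
    }

    int main() {
      // Exhaustive check against a plain 16-bit shift.
      for (unsigned N = 8; N <= 15; ++N)
        for (uint32_t V = 0; V <= 0xFFFF; ++V)
          assert(lsrWordBy(uint16_t(V), N) == uint16_t(V) >> N);
    }

LSL mirrors this on the high byte, and ASR works because `asr` keeps replicating the sign bit of the low byte after ASRWN's sign-extending move.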
@@ -184,6 +187,8 @@ protected: private: MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *insertCopyR1(MachineInstr &MI, + MachineBasicBlock *BB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index 51060018a5ca..ac52c47f93d5 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -304,11 +304,11 @@ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB, } Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index c7f423292da0..2b96dc0b833a 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -60,6 +60,9 @@ def AVRlsr : SDNode<"AVRISD::LSR", SDTIntUnaryOp>; def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>; def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>; def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>; +def AVRlslhi : SDNode<"AVRISD::LSLHI", SDTIntUnaryOp>; +def AVRlsrlo : SDNode<"AVRISD::LSRLO", SDTIntUnaryOp>; +def AVRasrlo : SDNode<"AVRISD::ASRLO", SDTIntUnaryOp>; def AVRlslbn : SDNode<"AVRISD::LSLBN", SDTIntBinOp>; def AVRlsrbn : SDNode<"AVRISD::LSRBN", SDTIntBinOp>; def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>; @@ -1391,7 +1394,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // ldd Rd, P+q // ldd Rd+1, P+q+1 let Constraints = "@earlyclobber $dst" in def LDDWRdPtrQ - : Pseudo<(outs DREGS_WITHOUT_YZ_WORKAROUND + : Pseudo<(outs DREGS : $dst), (ins memri : $memri), @@ -1699,21 +1702,34 @@ let mayLoad = 1, hasSideEffects = 0 in { : F16<0b1001010111011000, (outs), (ins), "elpm", []>, Requires<[HasELPM]>; - def ELPMRdZ : FLPMX<1, 0, - (outs GPR8 - : $dst), - (ins ZREG - : $z), + def ELPMRdZ : FLPMX<1, 0, (outs GPR8:$dst), (ins ZREG:$z), "elpm\t$dst, $z", []>, Requires<[HasELPMX]>; - let Defs = [R31R30] in def ELPMRdZPi : FLPMX<1, 1, - (outs GPR8 - : $dst), - (ins ZREG - : $z), - "elpm\t$dst, $z+", []>, - Requires<[HasELPMX]>; + let Defs = [R31R30] in { + def ELPMRdZPi : FLPMX<1, 1, (outs GPR8:$dst), (ins ZREG:$z), + "elpm\t$dst, $z+", []>, + Requires<[HasELPMX]>; + } + + // These pseudos are combination of the OUT and ELPM instructions. + let Defs = [R31R30], hasSideEffects = 1 in { + def ELPMBRdZ : Pseudo<(outs GPR8:$dst), (ins ZREG:$z, LD8:$p), + "elpmb\t$dst, $z, $p", []>, + Requires<[HasELPMX]>; + + def ELPMWRdZ : Pseudo<(outs DREGS:$dst), (ins ZREG:$z, LD8:$p), + "elpmw\t$dst, $z, $p", []>, + Requires<[HasELPMX]>; + + def ELPMBRdZPi : Pseudo<(outs GPR8:$dst), (ins ZREG:$z, LD8:$p), + "elpmb\t$dst, $z+, $p", []>, + Requires<[HasELPMX]>; + + def ELPMWRdZPi : Pseudo<(outs DREGS:$dst), (ins ZREG:$z, LD8:$p), + "elpmw\t$dst, $z+, $p", []>, + Requires<[HasELPMX]>; + } } // Store program memory operations. 
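Per the comment above, each ELPM*RdZ pseudo bundles the bank select and the
extended load so that register allocation sees one unit, while the LDI of the
bank number stays reusable across loads from the same bank. A plausible shape
for the later expansion, a sketch only (it assumes the bank write goes through
RAMPZ, whose I/O address 0x3b is exposed as getIORegRAMPZ() elsewhere in this
import; the real expansion lives in AVRExpandPseudoInsts.cpp):

    // ELPMBRdZ $dst, $z, $p   ; $p materialized by LDIRdK during ISel
    //   out 0x3b, $p          ; RAMPZ <- program-memory bank number
    //   elpm $dst, Z          ; load a byte from the 24-bit address RAMPZ:Z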
@@ -1848,6 +1864,9 @@ let Constraints = "$src = $rd", Defs = [SREG] in { : $src)), (implicit SREG)]>; + def LSLWHiRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lslwhi\t$rd", + [(set i16:$rd, (AVRlslhi i16:$src)), (implicit SREG)]>; + def LSLWNRd : Pseudo<(outs DLDREGS : $rd), (ins DREGS @@ -1895,6 +1914,9 @@ let Constraints = "$src = $rd", Defs = [SREG] in { : $src)), (implicit SREG)]>; + def LSRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lsrwlo\t$rd", + [(set i16:$rd, (AVRlsrlo i16:$src)), (implicit SREG)]>; + def LSRWNRd : Pseudo<(outs DLDREGS : $rd), (ins DREGS @@ -1968,6 +1990,9 @@ let Constraints = "$src = $rd", Defs = [SREG] in { : $src)), (implicit SREG)]>; + def ASRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrwlo\t$rd", + [(set i16:$rd, (AVRasrlo i16:$src)), (implicit SREG)]>; + def ROLBRd : Pseudo<(outs GPR8 : $rd), (ins GPR8 @@ -2365,6 +2390,10 @@ def Asr16 : ShiftPseudo<(outs DREGS : $src, i8 : $cnt))]>; +// lowered to a copy from R1, which contains the value zero. +let usesCustomInserter=1 in +def CopyR1 : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>; + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp index 1886debaf492..5dd7f5c55695 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp @@ -44,10 +44,7 @@ AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t * AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { - const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>(); - - return AFI->isInterruptOrSignalHandler() ? CSR_Interrupts_RegMask - : CSR_Normal_RegMask; + return CSR_Normal_RegMask; } BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.h b/llvm/lib/Target/AVR/AVRRegisterInfo.h index fa27d9283209..2c5647b52c1c 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.h +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.h @@ -27,7 +27,7 @@ public: public: const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -39,7 +39,7 @@ public: /// Stack Frame Processing Methods void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; Register getFrameRegister(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td index bb4e86ca0536..c5fda788fe4d 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.td +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -178,26 +178,6 @@ def DREGSMOVW : RegisterClass<"AVR", [i16], 8, R29R28, R17R16, R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, R1R0)>; -// The 16-bit DREGS register class, excluding the Z pointer register. -// -// This is used by instructions which cause high pointer register -// contention which leads to an assertion in the register allocator. 
-// -// There is no technical reason why instructions that use this class -// cannot use Z; it's simply a workaround a regalloc bug. -// -// More information can be found in PR39553. -def DREGS_WITHOUT_YZ_WORKAROUND - : RegisterClass<"AVR", [i16], 8, - ( - // Return value and arguments. - add R25R24, R19R18, R21R20, R23R22, - // Scratch registers. - R27R26, - // Callee saved registers. - R17R16, R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, - R1R0)>; - // 16-bit register class for immediate instructions. def DLDREGS : RegisterClass<"AVR", [i16], 8, ( diff --git a/llvm/lib/Target/AVR/AVRSubtarget.cpp b/llvm/lib/Target/AVR/AVRSubtarget.cpp index 990e1c57e63f..8a5481423e9f 100644 --- a/llvm/lib/Target/AVR/AVRSubtarget.cpp +++ b/llvm/lib/Target/AVR/AVRSubtarget.cpp @@ -40,8 +40,7 @@ AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU, m_hasTinyEncoding(false), m_hasMemMappedGPR(false), m_FeatureSetDummy(false), - InstrInfo(), FrameLowering(), - TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo() { + TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)) { // Parse features string. ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS); } diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h index 90b9cd4da7c1..f8ca191b1868 100644 --- a/llvm/lib/Target/AVR/AVRSubtarget.h +++ b/llvm/lib/Target/AVR/AVRSubtarget.h @@ -91,6 +91,9 @@ public: return ELFArch; } + /// Get I/O register address. + int getIORegRAMPZ(void) const { return 0x3b; } + private: /// The ELF e_flags architecture. unsigned ELFArch; diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp index 65740f7c2306..22b9ba3ece07 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -70,7 +70,6 @@ public: bool addInstSelector() override; void addPreSched2() override; void addPreEmitPass() override; - void addPreRegAlloc() override; }; } // namespace @@ -118,11 +117,6 @@ bool AVRPassConfig::addInstSelector() { return false; } -void AVRPassConfig::addPreRegAlloc() { - // Create the dynalloc SP save/restore pass to handle variable sized allocas. 
- addPass(createAVRDynAllocaSRPass()); -} - void AVRPassConfig::addPreSched2() { addPass(createAVRRelaxMemPass()); addPass(createAVRExpandPseudoPass()); diff --git a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp index c7715ca1f51b..fe8e863be1a3 100644 --- a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp +++ b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AVRTargetObjectFile.h" +#include "AVRTargetMachine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/DerivedTypes.h" @@ -22,14 +23,60 @@ void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { Base::Initialize(Ctx, TM); ProgmemDataSection = Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + Progmem1DataSection = + Ctx.getELFSection(".progmem1.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + Progmem2DataSection = + Ctx.getELFSection(".progmem2.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + Progmem3DataSection = + Ctx.getELFSection(".progmem3.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + Progmem4DataSection = + Ctx.getELFSection(".progmem4.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + Progmem5DataSection = + Ctx.getELFSection(".progmem5.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); } MCSection *AVRTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - // Global values in flash memory are placed in the progmem.data section + // Global values in flash memory are placed in the progmem*.data section // unless they already have a user assigned section. - if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection() && Kind.isReadOnly()) - return ProgmemDataSection; + const auto &AVRTM = static_cast<const AVRTargetMachine &>(TM); + if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection() && + Kind.isReadOnly()) { + // The AVR subtarget should support LPM to access section '.progmem*.data'. + if (!AVRTM.getSubtargetImpl()->hasLPM()) { + // TODO: Get the global object's location in source file. + getContext().reportError( + SMLoc(), + "Current AVR subtarget does not support accessing program memory"); + return Base::SelectSectionForGlobal(GO, Kind, TM); + } + // The AVR subtarget should support ELPM to access section + // '.progmem[1|2|3|4|5].data'. + if (!AVRTM.getSubtargetImpl()->hasELPM() && + AVR::getAddressSpace(GO) != AVR::ProgramMemory) { + // TODO: Get the global object's location in source file. + getContext().reportError(SMLoc(), + "Current AVR subtarget does not support " + "accessing extended program memory"); + return ProgmemDataSection; + } + switch (AVR::getAddressSpace(GO)) { + case AVR::ProgramMemory: // address space 1 + return ProgmemDataSection; + case AVR::ProgramMemory1: // address space 2 + return Progmem1DataSection; + case AVR::ProgramMemory2: // address space 3 + return Progmem2DataSection; + case AVR::ProgramMemory3: // address space 4 + return Progmem3DataSection; + case AVR::ProgramMemory4: // address space 5 + return Progmem4DataSection; + case AVR::ProgramMemory5: // address space 6 + return Progmem5DataSection; + default: + llvm_unreachable("unexpected program memory index"); + } + } // Otherwise, we work the same way as ELF. 
return Base::SelectSectionForGlobal(GO, Kind, TM); diff --git a/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/llvm/lib/Target/AVR/AVRTargetObjectFile.h index 53d8510d9a21..609849b44029 100644 --- a/llvm/lib/Target/AVR/AVRTargetObjectFile.h +++ b/llvm/lib/Target/AVR/AVRTargetObjectFile.h @@ -25,6 +25,11 @@ public: private: MCSection *ProgmemDataSection; + MCSection *Progmem1DataSection; + MCSection *Progmem2DataSection; + MCSection *Progmem3DataSection; + MCSection *Progmem4DataSection; + MCSection *Progmem5DataSection; }; } // end namespace llvm diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index 95ecd28200ba..f19e7840eb31 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -107,13 +107,13 @@ class AVROperand : public MCParsedAsmOperand { public: AVROperand(StringRef Tok, SMLoc const &S) - : Base(), Kind(k_Token), Tok(Tok), Start(S), End(S) {} + : Kind(k_Token), Tok(Tok), Start(S), End(S) {} AVROperand(unsigned Reg, SMLoc const &S, SMLoc const &E) - : Base(), Kind(k_Register), RegImm({Reg, nullptr}), Start(S), End(E) {} + : Kind(k_Register), RegImm({Reg, nullptr}), Start(S), End(E) {} AVROperand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E) - : Base(), Kind(k_Immediate), RegImm({0, Imm}), Start(S), End(E) {} + : Kind(k_Immediate), RegImm({0, Imm}), Start(S), End(E) {} AVROperand(unsigned Reg, MCExpr const *Imm, SMLoc const &S, SMLoc const &E) - : Base(), Kind(k_Memri), RegImm({Reg, Imm}), Start(S), End(E) {} + : Kind(k_Memri), RegImm({Reg, Imm}), Start(S), End(E) {} struct RegisterImmediate { unsigned Reg; @@ -281,7 +281,7 @@ bool AVRAsmParser::invalidOperand(SMLoc const &Loc, OperandVector const &Operands, uint64_t const &ErrorInfo) { SMLoc ErrorLoc = Loc; - char const *Diag = 0; + char const *Diag = nullptr; if (ErrorInfo != ~0U) { if (ErrorInfo >= Operands.size()) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index a3a4d63932c0..3624ade854c0 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -47,7 +47,7 @@ static void signed_width(unsigned Width, uint64_t Value, " to " + std::to_string(Max) + ")"; if (Ctx) { - Ctx->reportFatalError(Fixup.getLoc(), Diagnostic); + Ctx->reportError(Fixup.getLoc(), Diagnostic); } else { llvm_unreachable(Diagnostic.c_str()); } @@ -66,7 +66,7 @@ static void unsigned_width(unsigned Width, uint64_t Value, " (expected an integer in the range 0 to " + std::to_string(Max) + ")"; if (Ctx) { - Ctx->reportFatalError(Fixup.getLoc(), Diagnostic); + Ctx->reportError(Fixup.getLoc(), Diagnostic); } else { llvm_unreachable(Diagnostic.c_str()); } diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 50298bf5e943..697deb117bcb 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -101,7 +101,7 @@ struct BPFOperand : public MCParsedAsmOperand { ImmOp Imm; }; - BPFOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + BPFOperand(KindTy K) : Kind(K) {} public: BPFOperand(const BPFOperand &o) : MCParsedAsmOperand() { diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index ab7e848409d9..46141e69d9d4 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -1002,7 
+1002,7 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call, VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr or enum value GV = new GlobalVariable(*M, VarType, false, GlobalVariable::ExternalLinkage, - NULL, AccessKey); + nullptr, AccessKey); GV->addAttribute(BPFCoreSharedInfo::AmaAttr); GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta); GEPGlobals[AccessKey] = GV; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 90723ac04f64..0587cb0e16e3 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -325,7 +325,7 @@ SDValue BPFTargetLowering::LowerFormalArguments( default: { errs() << "LowerFormalArguments Unhandled argument type: " << RegVT.getEVTString() << '\n'; - llvm_unreachable(0); + llvm_unreachable(nullptr); } case MVT::i32: case MVT::i64: diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp index eb8c48ac49de..2bc2302cf55c 100644 --- a/llvm/lib/Target/BPF/BPFMIChecking.cpp +++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp @@ -41,7 +41,7 @@ private: // Initialize class variables. void initialize(MachineFunction &MFParm); - bool processAtomicInsts(void); + bool processAtomicInsts(); public: @@ -151,7 +151,7 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) { return false; } -bool BPFMIPreEmitChecking::processAtomicInsts(void) { +bool BPFMIPreEmitChecking::processAtomicInsts() { for (MachineBasicBlock &MBB : *MF) { for (MachineInstr &MI : MBB) { if (MI.getOpcode() != BPF::XADDW && diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp index 354980e4bf3c..7f69c8a63443 100644 --- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp +++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp @@ -56,8 +56,8 @@ private: bool isInsnFrom32Def(MachineInstr *DefInsn); bool isPhiFrom32Def(MachineInstr *MovMI); bool isMovFrom32Def(MachineInstr *MovMI); - bool eliminateZExtSeq(void); - bool eliminateZExt(void); + bool eliminateZExtSeq(); + bool eliminateZExt(); std::set<MachineInstr *> PhiInsns; @@ -172,7 +172,7 @@ bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) return true; } -bool BPFMIPeephole::eliminateZExtSeq(void) { +bool BPFMIPeephole::eliminateZExtSeq() { MachineInstr* ToErase = nullptr; bool Eliminated = false; @@ -240,7 +240,7 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { return Eliminated; } -bool BPFMIPeephole::eliminateZExt(void) { +bool BPFMIPeephole::eliminateZExt() { MachineInstr* ToErase = nullptr; bool Eliminated = false; @@ -312,7 +312,7 @@ private: // Initialize class variables. void initialize(MachineFunction &MFParm); - bool eliminateRedundantMov(void); + bool eliminateRedundantMov(); public: @@ -334,7 +334,7 @@ void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) { LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n"); } -bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) { +bool BPFMIPreEmitPeephole::eliminateRedundantMov() { MachineInstr* ToErase = nullptr; bool Eliminated = false; @@ -405,7 +405,7 @@ private: // Initialize class variables. void initialize(MachineFunction &MFParm); - bool eliminateTruncSeq(void); + bool eliminateTruncSeq(); public: @@ -452,7 +452,7 @@ void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) { // are 32-bit registers, but later on, kernel verifier will rewrite // it with 64-bit value. Therefore, truncating the value after the // load will result in incorrect code. 
-bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) { +bool BPFMIPeepholeTruncElim::eliminateTruncSeq() { MachineInstr* ToErase = nullptr; bool Eliminated = false; diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp index 7e829ea43e89..b4232875383c 100644 --- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp +++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp @@ -55,7 +55,7 @@ private: // Initialize class variables. void initialize(MachineFunction &MFParm); - bool removeLD(void); + bool removeLD(); void processCandidate(MachineRegisterInfo *MRI, MachineBasicBlock &MBB, MachineInstr &MI, Register &SrcReg, Register &DstReg, const GlobalValue *GVal, bool IsAma); diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index 36237b2fc4fd..6dfb7dc39922 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -105,10 +105,10 @@ static bool BPFPreserveDITypeImpl(Function &F) { BasicBlock *BB = Call->getParent(); IntegerType *VarType = Type::getInt64Ty(BB->getContext()); - std::string GVName = BaseName + std::to_string(Count) + "$" + - std::to_string(Reloc); + std::string GVName = + BaseName + std::to_string(Count) + "$" + std::to_string(Reloc); GlobalVariable *GV = new GlobalVariable( - *M, VarType, false, GlobalVariable::ExternalLinkage, NULL, GVName); + *M, VarType, false, GlobalVariable::ExternalLinkage, nullptr, GVName); GV->addAttribute(BPFCoreSharedInfo::TypeIdAttr); GV->setMetadata(LLVMContext::MD_preserve_access_index, MD); diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp index 77e3cd393f87..e4d98b85e58b 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.cpp +++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp @@ -59,6 +59,6 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) - : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(), + : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 0c510686a13b..d536aed1d211 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -1366,7 +1366,7 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) { // Calculate symbol size const DataLayout &DL = Global.getParent()->getDataLayout(); - uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType()); + uint32_t Size = DL.getTypeAllocSize(Global.getValueType()); DataSecEntries[std::string(SecName)]->addDataSecEntry(VarId, Asm->getSymbol(&Global), Size); diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp index e0aeec989879..200c72a07ed6 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp @@ -50,7 +50,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &O) { void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { O << getRegisterName(Op.getReg()); diff --git 
a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp index 29b99a84a6cd..a62bd111cba9 100644 --- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp +++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp @@ -303,6 +303,14 @@ public: bool isRegSeq() const { return isRegSeqTemplate<CSKY::R0, CSKY::R31>(); } + bool isRegSeqV1() const { + return isRegSeqTemplate<CSKY::F0_32, CSKY::F15_32>(); + } + + bool isRegSeqV2() const { + return isRegSeqTemplate<CSKY::F0_32, CSKY::F31_32>(); + } + static bool isLegalRegList(unsigned from, unsigned to) { if (from == 0 && to == 0) return true; diff --git a/llvm/lib/Target/CSKY/CSKY.h b/llvm/lib/Target/CSKY/CSKY.h index 357b1e96e606..401d6fa1a0a5 100644 --- a/llvm/lib/Target/CSKY/CSKY.h +++ b/llvm/lib/Target/CSKY/CSKY.h @@ -21,6 +21,9 @@ class CSKYTargetMachine; class FunctionPass; FunctionPass *createCSKYISelDag(CSKYTargetMachine &TM); +FunctionPass *createCSKYConstantIslandPass(); + +void initializeCSKYConstantIslandsPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/Target/CSKY/CSKY.td b/llvm/lib/Target/CSKY/CSKY.td index e26781ca6aa1..ddb7fe93706e 100644 --- a/llvm/lib/Target/CSKY/CSKY.td +++ b/llvm/lib/Target/CSKY/CSKY.td @@ -11,6 +11,40 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // CSKY subtarget features and instruction predicates. //===----------------------------------------------------------------------===// +def ModeHardFloat : + SubtargetFeature<"hard-float", "UseHardFloat", + "true", "Use hard floating point features">; +def ModeHardFloatABI : + SubtargetFeature<"hard-float-abi", "UseHardFloatABI", + "true", "Use hard floating point ABI to pass args">; + +def FeatureFPUV2_SF + : SubtargetFeature<"fpuv2_sf", "HasFPUv2SingleFloat", "true", + "Enable FPUv2 single float instructions">; +def HasFPUv2_SF : Predicate<"Subtarget->hasFPUv2SingleFloat()">, + AssemblerPredicate<(all_of FeatureFPUV2_SF), + "Enable FPUv2 single float instructions">; + +def FeatureFPUV2_DF + : SubtargetFeature<"fpuv2_df", "HasFPUv2DoubleFloat", "true", + "Enable FPUv2 double float instructions">; +def HasFPUv2_DF : Predicate<"Subtarget->hasFPUv2DoubleFloat()">, + AssemblerPredicate<(all_of FeatureFPUV2_DF), + "Enable FPUv2 double float instructions">; + +def FeatureFPUV3_SF + : SubtargetFeature<"fpuv3_sf", "HasFPUv3SingleFloat", "true", + "Enable FPUv3 single float instructions">; +def HasFPUv3_SF : Predicate<"Subtarget->hasFPUv3SingleFloat()">, + AssemblerPredicate<(all_of FeatureFPUV3_SF), + "Enable FPUv3 single float instructions">; + +def FeatureFPUV3_DF + : SubtargetFeature<"fpuv3_df", "HasFPUv3DoubleFloat", "true", + "Enable FPUv3 double float instructions">; +def HasFPUv3_DF : Predicate<"Subtarget->hasFPUv3DoubleFloat()">, + AssemblerPredicate<(all_of FeatureFPUV3_DF), + "Enable FPUv3 double float instructions">; def FeatureBTST16 : SubtargetFeature<"btst16", "HasBTST16", "true", "Use the 16-bit btsti instruction">; diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp index 85129f78e726..c8269eeacfdb 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "CSKYAsmPrinter.h" #include "CSKY.h" +#include "CSKYConstantPoolValue.h" #include "CSKYTargetMachine.h" #include "MCTargetDesc/CSKYInstPrinter.h" #include "MCTargetDesc/CSKYMCExpr.h" @@ -38,6 
+39,7 @@ CSKYAsmPrinter::CSKYAsmPrinter(llvm::TargetMachine &TM, : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this) {} bool CSKYAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + MCP = MF.getConstantPool(); Subtarget = &MF.getSubtarget<CSKYSubtarget>(); return AsmPrinter::runOnMachineFunction(MF); } @@ -56,16 +58,166 @@ void CSKYAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { // instructions) auto-generated. #include "CSKYGenMCPseudoLowering.inc" +void CSKYAsmPrinter::expandTLSLA(const MachineInstr *MI) { + const CSKYInstrInfo *TII = Subtarget->getInstrInfo(); + + DebugLoc DL = MI->getDebugLoc(); + + MCSymbol *PCLabel = OutContext.getOrCreateSymbol( + Twine(MAI->getPrivateGlobalPrefix()) + "PC" + Twine(getFunctionNumber()) + + "_" + Twine(MI->getOperand(3).getImm())); + + OutStreamer->emitLabel(PCLabel); + + auto Instr = BuildMI(*MF, DL, TII->get(CSKY::LRW32)) + .add(MI->getOperand(0)) + .add(MI->getOperand(2)); + MCInst LRWInst; + MCInstLowering.Lower(Instr, LRWInst); + EmitToStreamer(*OutStreamer, LRWInst); + + Instr = BuildMI(*MF, DL, TII->get(CSKY::GRS32)) + .add(MI->getOperand(1)) + .addSym(PCLabel); + MCInst GRSInst; + MCInstLowering.Lower(Instr, GRSInst); + EmitToStreamer(*OutStreamer, GRSInst); + return; +} + +void CSKYAsmPrinter::emitCustomConstantPool(const MachineInstr *MI) { + + // This instruction represents a floating constant pool in the function. + // The first operand is the ID# for this instruction, the second is the + // index into the MachineConstantPool that this is, the third is the size + // in bytes of this constant pool entry. + // The required alignment is specified on the basic block holding this MI. + unsigned LabelId = (unsigned)MI->getOperand(0).getImm(); + unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex(); + + // If this is the first entry of the pool, mark it. + if (!InConstantPool) { + OutStreamer->emitValueToAlignment(4); + InConstantPool = true; + } + + OutStreamer->emitLabel(GetCPISymbol(LabelId)); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; + if (MCPE.isMachineConstantPoolEntry()) + emitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + else + emitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal); + return; +} + +void CSKYAsmPrinter::emitFunctionBodyEnd() { + // Make sure to terminate any constant pools that were at the end + // of the function. + if (!InConstantPool) + return; + InConstantPool = false; +} + void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) { // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; + // If we just ended a constant pool, mark it as such. + if (InConstantPool && MI->getOpcode() != CSKY::CONSTPOOL_ENTRY) { + InConstantPool = false; + } + + if (MI->getOpcode() == CSKY::PseudoTLSLA32) + return expandTLSLA(MI); + + if (MI->getOpcode() == CSKY::CONSTPOOL_ENTRY) + return emitCustomConstantPool(MI); + MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); } +// Convert a CSKY-specific constant pool modifier into the associated +// MCSymbolRefExpr variant kind. 
+static CSKYMCExpr::VariantKind +getModifierVariantKind(CSKYCP::CSKYCPModifier Modifier) { + switch (Modifier) { + case CSKYCP::NO_MOD: + return CSKYMCExpr::VK_CSKY_None; + case CSKYCP::ADDR: + return CSKYMCExpr::VK_CSKY_ADDR; + case CSKYCP::GOT: + return CSKYMCExpr::VK_CSKY_GOT; + case CSKYCP::GOTOFF: + return CSKYMCExpr::VK_CSKY_GOTOFF; + case CSKYCP::PLT: + return CSKYMCExpr::VK_CSKY_PLT; + case CSKYCP::TLSGD: + return CSKYMCExpr::VK_CSKY_TLSGD; + case CSKYCP::TLSLE: + return CSKYMCExpr::VK_CSKY_TLSLE; + case CSKYCP::TLSIE: + return CSKYMCExpr::VK_CSKY_TLSIE; + } + llvm_unreachable("Invalid CSKYCPModifier!"); +} + +void CSKYAsmPrinter::emitMachineConstantPoolValue( + MachineConstantPoolValue *MCPV) { + int Size = getDataLayout().getTypeAllocSize(MCPV->getType()); + CSKYConstantPoolValue *CCPV = static_cast<CSKYConstantPoolValue *>(MCPV); + MCSymbol *MCSym; + + if (CCPV->isBlockAddress()) { + const BlockAddress *BA = + cast<CSKYConstantPoolConstant>(CCPV)->getBlockAddress(); + MCSym = GetBlockAddressSymbol(BA); + } else if (CCPV->isGlobalValue()) { + const GlobalValue *GV = cast<CSKYConstantPoolConstant>(CCPV)->getGV(); + MCSym = getSymbol(GV); + } else if (CCPV->isMachineBasicBlock()) { + const MachineBasicBlock *MBB = cast<CSKYConstantPoolMBB>(CCPV)->getMBB(); + MCSym = MBB->getSymbol(); + } else if (CCPV->isJT()) { + signed JTI = cast<CSKYConstantPoolJT>(CCPV)->getJTI(); + MCSym = GetJTISymbol(JTI); + } else { + assert(CCPV->isExtSymbol() && "unrecognized constant pool value"); + StringRef Sym = cast<CSKYConstantPoolSymbol>(CCPV)->getSymbol(); + MCSym = GetExternalSymbolSymbol(Sym); + } + // Create an MCSymbol for the reference. + const MCExpr *Expr = + MCSymbolRefExpr::create(MCSym, MCSymbolRefExpr::VK_None, OutContext); + + if (CCPV->getPCAdjustment()) { + + MCSymbol *PCLabel = OutContext.getOrCreateSymbol( + Twine(MAI->getPrivateGlobalPrefix()) + "PC" + + Twine(getFunctionNumber()) + "_" + Twine(CCPV->getLabelID())); + + const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext); + if (CCPV->mustAddCurrentAddress()) { + // We want "(<expr> - .)", but MC doesn't have a concept of the '.' + // label, so just emit a local label end reference that instead. + MCSymbol *DotSym = OutContext.createTempSymbol(); + OutStreamer->emitLabel(DotSym); + const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); + PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext); + } + Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext); + } + + // Create an MCSymbol for the reference. + Expr = CSKYMCExpr::create(Expr, getModifierVariantKind(CCPV->getModifier()), + OutContext); + + OutStreamer->emitValue(Expr, Size); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmPrinter() { RegisterAsmPrinter<CSKYAsmPrinter> X(getTheCSKYTarget()); } diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h index b30311e0ca64..04a253d349c8 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h @@ -20,6 +20,15 @@ class LLVM_LIBRARY_VISIBILITY CSKYAsmPrinter : public AsmPrinter { const CSKYSubtarget *Subtarget; + bool InConstantPool = false; + + /// Keep a pointer to constantpool entries of the current + /// MachineFunction. 
+ MachineConstantPool *MCP; + + void expandTLSLA(const MachineInstr *MI); + void emitCustomConstantPool(const MachineInstr *MI); + public: explicit CSKYAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); @@ -33,9 +42,16 @@ public: bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); + void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override; + + void emitFunctionBodyEnd() override; + void emitInstruction(const MachineInstr *MI) override; bool runOnMachineFunction(MachineFunction &MF) override; + + // we emit constant pools customly! + void emitConstantPool() override{}; }; } // end namespace llvm diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp new file mode 100644 index 000000000000..3ac335e2ad9d --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -0,0 +1,1376 @@ +//===- CSKYConstantIslandPass.cpp - Emit PC Relative loads ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +// Loading constants inline is expensive on CSKY and it's in general better +// to place the constant nearby in code space and then it can be loaded with a +// simple 16/32 bit load instruction like lrw. +// +// The constants can be not just numbers but addresses of functions and labels. +// This can be particularly helpful in static relocation mode for embedded +// non-linux targets. +// +//===----------------------------------------------------------------------===// + +#include "CSKY.h" +#include "CSKYConstantPoolValue.h" +#include "CSKYMachineFunctionInfo.h" +#include "CSKYSubtarget.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "CSKY-constant-islands" + +STATISTIC(NumCPEs, "Number of constpool entries"); +STATISTIC(NumSplit, "Number of uncond branches inserted"); +STATISTIC(NumCBrFixed, "Number of cond branches fixed"); +STATISTIC(NumUBrFixed, "Number of uncond branches fixed"); + +namespace { + +using Iter = MachineBasicBlock::iterator; +using ReverseIter = MachineBasicBlock::reverse_iterator; + +/// CSKYConstantIslands - Due to limited PC-relative displacements, CSKY +/// requires constant pool 
entries to be scattered among the instructions +/// inside a function. To do this, it completely ignores the normal LLVM +/// constant pool; instead, it places constants wherever it feels like with +/// special instructions. +/// +/// The terminology used in this pass includes: +/// Islands - Clumps of constants placed in the function. +/// Water - Potential places where an island could be formed. +/// CPE - A constant pool entry that has been placed somewhere, which +/// tracks a list of users. + +class CSKYConstantIslands : public MachineFunctionPass { + /// BasicBlockInfo - Information about the offset and size of a single + /// basic block. + struct BasicBlockInfo { + /// Offset - Distance from the beginning of the function to the beginning + /// of this basic block. + /// + /// Offsets are computed assuming worst case padding before an aligned + /// block. This means that subtracting basic block offsets always gives a + /// conservative estimate of the real distance which may be smaller. + /// + /// Because worst case padding is used, the computed offset of an aligned + /// block may not actually be aligned. + unsigned Offset = 0; + + /// Size - Size of the basic block in bytes. If the block contains + /// inline assembly, this is a worst case estimate. + /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size = 0; + + BasicBlockInfo() = default; + + unsigned postOffset() const { return Offset + Size; } + }; + + std::vector<BasicBlockInfo> BBInfo; + + /// WaterList - A sorted list of basic blocks where islands could be placed + /// (i.e. blocks that don't fall through to the following block, due + /// to a return, unreachable, or unconditional branch). + std::vector<MachineBasicBlock *> WaterList; + + /// NewWaterList - The subset of WaterList that was created since the + /// previous iteration by inserting unconditional branches. + SmallSet<MachineBasicBlock *, 4> NewWaterList; + + using water_iterator = std::vector<MachineBasicBlock *>::iterator; + + /// CPUser - One user of a constant pool, keeping the machine instruction + /// pointer, the constant pool being referenced, and the max displacement + /// allowed from the instruction to the CP. The HighWaterMark records the + /// highest basic block where a new CPEntry can be placed. To ensure this + /// pass terminates, the CP entries are initially placed at the end of the + /// function and then move monotonically to lower addresses. The + /// exception to this rule is when the current CP entry for a particular + /// CPUser is out of range, but there is another CP entry for the same + /// constant value in range. We want to use the existing in-range CP + /// entry, but if it later moves out of range, the search for new water + /// should resume where it left off. The HighWaterMark is used to record + /// that point. + struct CPUser { + MachineInstr *MI; + MachineInstr *CPEMI; + MachineBasicBlock *HighWaterMark; + + private: + unsigned MaxDisp; + + public: + bool NegOk; + + CPUser(MachineInstr *Mi, MachineInstr *Cpemi, unsigned Maxdisp, bool Neg) + : MI(Mi), CPEMI(Cpemi), MaxDisp(Maxdisp), NegOk(Neg) { + HighWaterMark = CPEMI->getParent(); + } + + /// getMaxDisp - Returns the maximum displacement supported by MI. 
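    /// (Hedged reading of the 16-byte cushion subtracted below: block
    /// offsets are computed assuming worst-case alignment padding, so a
    /// computed displacement can understate the real distance, and reserving
    /// slack keeps the estimate conservative. That rationale is an inference;
    /// the constant is a tuning choice this file does not itself justify.)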
+ unsigned getMaxDisp() const { return MaxDisp - 16; } + + void setMaxDisp(unsigned Val) { MaxDisp = Val; } + }; + + /// CPUsers - Keep track of all of the machine instructions that use various + /// constant pools and their max displacement. + std::vector<CPUser> CPUsers; + + /// CPEntry - One per constant pool entry, keeping the machine instruction + /// pointer, the constpool index, and the number of CPUser's which + /// reference this entry. + struct CPEntry { + MachineInstr *CPEMI; + unsigned CPI; + unsigned RefCount; + + CPEntry(MachineInstr *Cpemi, unsigned Cpi, unsigned Rc = 0) + : CPEMI(Cpemi), CPI(Cpi), RefCount(Rc) {} + }; + + /// CPEntries - Keep track of all of the constant pool entry machine + /// instructions. For each original constpool index (i.e. those that + /// existed upon entry to this pass), it keeps a vector of entries. + /// Original elements are cloned as we go along; the clones are + /// put in the vector of the original element, but have distinct CPIs. + std::vector<std::vector<CPEntry>> CPEntries; + + /// ImmBranch - One per immediate branch, keeping the machine instruction + /// pointer, conditional or unconditional, the max displacement, + /// and (if isCond is true) the corresponding unconditional branch + /// opcode. + struct ImmBranch { + MachineInstr *MI; + unsigned MaxDisp : 31; + bool IsCond : 1; + int UncondBr; + + ImmBranch(MachineInstr *Mi, unsigned Maxdisp, bool Cond, int Ubr) + : MI(Mi), MaxDisp(Maxdisp), IsCond(Cond), UncondBr(Ubr) {} + }; + + /// ImmBranches - Keep track of all the immediate branch instructions. + /// + std::vector<ImmBranch> ImmBranches; + + const CSKYSubtarget *STI = nullptr; + const CSKYInstrInfo *TII; + CSKYMachineFunctionInfo *MFI; + MachineFunction *MF = nullptr; + MachineConstantPool *MCP = nullptr; + + unsigned PICLabelUId; + + void initPICLabelUId(unsigned UId) { PICLabelUId = UId; } + + unsigned createPICLabelUId() { return PICLabelUId++; } + +public: + static char ID; + + CSKYConstantIslands() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "CSKY Constant Islands"; } + + bool runOnMachineFunction(MachineFunction &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + void doInitialPlacement(std::vector<MachineInstr *> &CPEMIs); + CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); + Align getCPEAlign(const MachineInstr &CPEMI); + void initializeFunctionInfo(const std::vector<MachineInstr *> &CPEMIs); + unsigned getOffsetOf(MachineInstr *MI) const; + unsigned getUserOffset(CPUser &) const; + void dumpBBs(); + + bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, unsigned Disp, + bool NegativeOK); + bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, + const CPUser &U); + + void computeBlockSize(MachineBasicBlock *MBB); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI); + void updateForInsertedWaterBlock(MachineBasicBlock *NewBB); + void adjustBBOffsetsAfter(MachineBasicBlock *BB); + bool decrementCPEReferenceCount(unsigned CPI, MachineInstr *CPEMI); + int findInRangeCPEntry(CPUser &U, unsigned UserOffset); + bool findAvailableWater(CPUser &U, unsigned UserOffset, + water_iterator &WaterIter); + void createNewWater(unsigned CPUserIndex, unsigned UserOffset, + 
MachineBasicBlock *&NewMBB); + bool handleConstantPoolUser(unsigned CPUserIndex); + void removeDeadCPEMI(MachineInstr *CPEMI); + bool removeUnusedCPEntries(); + bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned Disp, bool NegOk, + bool DoDump = false); + bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water, CPUser &U, + unsigned &Growth); + bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool fixupImmediateBr(ImmBranch &Br); + bool fixupConditionalBr(ImmBranch &Br); + bool fixupUnconditionalBr(ImmBranch &Br); +}; +} // end anonymous namespace + +char CSKYConstantIslands::ID = 0; + +bool CSKYConstantIslands::isOffsetInRange(unsigned UserOffset, + unsigned TrialOffset, + const CPUser &U) { + return isOffsetInRange(UserOffset, TrialOffset, U.getMaxDisp(), U.NegOk); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// print block size and offset information - debugging +LLVM_DUMP_METHOD void CSKYConstantIslands::dumpBBs() { + for (unsigned J = 0, E = BBInfo.size(); J != E; ++J) { + const BasicBlockInfo &BBI = BBInfo[J]; + dbgs() << format("%08x %bb.%u\t", BBI.Offset, J) + << format(" size=%#x\n", BBInfo[J].Size); + } +} +#endif + +bool CSKYConstantIslands::runOnMachineFunction(MachineFunction &Mf) { + MF = &Mf; + MCP = Mf.getConstantPool(); + STI = &static_cast<const CSKYSubtarget &>(Mf.getSubtarget()); + + LLVM_DEBUG(dbgs() << "***** CSKYConstantIslands: " + << MCP->getConstants().size() << " CP entries, aligned to " + << MCP->getConstantPoolAlign().value() << " bytes *****\n"); + + TII = STI->getInstrInfo(); + MFI = MF->getInfo<CSKYMachineFunctionInfo>(); + + // This pass invalidates liveness information when it splits basic blocks. + MF->getRegInfo().invalidateLiveness(); + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF->RenumberBlocks(); + + bool MadeChange = false; + + // Perform the initial placement of the constant pool entries. To start with, + // we put them all at the end of the function. + std::vector<MachineInstr *> CPEMIs; + if (!MCP->isEmpty()) + doInitialPlacement(CPEMIs); + + /// The next UID to take is the first unused one. + initPICLabelUId(CPEMIs.size()); + + // Do the initial scan of the function, building up information about the + // sizes of each block, the location of all the water, and finding all of the + // constant pool users. + initializeFunctionInfo(CPEMIs); + CPEMIs.clear(); + LLVM_DEBUG(dumpBBs()); + + /// Remove dead constant pool entries. + MadeChange |= removeUnusedCPEntries(); + + // Iteratively place constant pool entries and fix up branches until there + // is no change. + unsigned NoCPIters = 0, NoBRIters = 0; + (void)NoBRIters; + while (true) { + LLVM_DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n'); + bool CPChange = false; + for (unsigned I = 0, E = CPUsers.size(); I != E; ++I) + CPChange |= handleConstantPoolUser(I); + if (CPChange && ++NoCPIters > 30) + report_fatal_error("Constant Island pass failed to converge!"); + LLVM_DEBUG(dumpBBs()); + + // Clear NewWaterList now. If we split a block for branches, it should + // appear as "new water" for the next iteration of constant pool placement. 
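    // (Structure note, inferred from the surrounding code: every round of
    // the enclosing while (true) loop runs a constant-pool placement phase
    // followed by a branch-fixup phase, each aborting via report_fatal_error
    // if it fails to converge within 30 rounds, and the loop exits only when
    // a full round changes nothing.)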
+ NewWaterList.clear(); + + LLVM_DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n'); + bool BRChange = false; + for (unsigned I = 0, E = ImmBranches.size(); I != E; ++I) + BRChange |= fixupImmediateBr(ImmBranches[I]); + if (BRChange && ++NoBRIters > 30) + report_fatal_error("Branch Fix Up pass failed to converge!"); + LLVM_DEBUG(dumpBBs()); + if (!CPChange && !BRChange) + break; + MadeChange = true; + } + + LLVM_DEBUG(dbgs() << '\n'; dumpBBs()); + + BBInfo.clear(); + WaterList.clear(); + CPUsers.clear(); + CPEntries.clear(); + ImmBranches.clear(); + return MadeChange; +} + +/// doInitialPlacement - Perform the initial placement of the constant pool +/// entries. To start with, we put them all at the end of the function. +void CSKYConstantIslands::doInitialPlacement( + std::vector<MachineInstr *> &CPEMIs) { + // Create the basic block to hold the CPE's. + MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); + MF->push_back(BB); + + // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). + const Align MaxAlign = MCP->getConstantPoolAlign(); + + // Mark the basic block as required by the const-pool. + BB->setAlignment(Align(2)); + + // The function needs to be as aligned as the basic blocks. The linker may + // move functions around based on their alignment. + MF->ensureAlignment(BB->getAlignment()); + + // Order the entries in BB by descending alignment. That ensures correct + // alignment of all entries as long as BB is sufficiently aligned. Keep + // track of the insertion point for each alignment. We are going to bucket + // sort the entries as they are created. + SmallVector<MachineBasicBlock::iterator, 8> InsPoint(Log2(MaxAlign) + 1, + BB->end()); + + // Add all of the constants from the constant pool to the end block, use an + // identity mapping of CPI's to CPE's. + const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); + + const DataLayout &TD = MF->getDataLayout(); + for (unsigned I = 0, E = CPs.size(); I != E; ++I) { + unsigned Size = CPs[I].getSizeInBytes(TD); + assert(Size >= 4 && "Too small constant pool entry"); + Align Alignment = CPs[I].getAlign(); + // Verify that all constant pool entries are a multiple of their alignment. + // If not, we would have to pad them out so that instructions stay aligned. + assert(isAligned(Alignment, Size) && "CP Entry not multiple of 4 bytes!"); + + // Insert CONSTPOOL_ENTRY before entries with a smaller alignment. + unsigned LogAlign = Log2(Alignment); + MachineBasicBlock::iterator InsAt = InsPoint[LogAlign]; + + MachineInstr *CPEMI = + BuildMI(*BB, InsAt, DebugLoc(), TII->get(CSKY::CONSTPOOL_ENTRY)) + .addImm(I) + .addConstantPoolIndex(I) + .addImm(Size); + + CPEMIs.push_back(CPEMI); + + // Ensure that future entries with higher alignment get inserted before + // CPEMI. This is bucket sort with iterators. + for (unsigned A = LogAlign + 1; A <= Log2(MaxAlign); ++A) + if (InsPoint[A] == InsAt) + InsPoint[A] = CPEMI; + // Add a new CPEntry, but no corresponding CPUser yet. + CPEntries.emplace_back(1, CPEntry(CPEMI, I)); + ++NumCPEs; + LLVM_DEBUG(dbgs() << "Moved CPI#" << I << " to end of function, size = " + << Size << ", align = " << Alignment.value() << '\n'); + } + LLVM_DEBUG(BB->dump()); +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +static bool bbHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. 
+ MachineFunction::iterator MBBI = MBB->getIterator(); + // Can't fall off end of function. + if (std::next(MBBI) == MBB->getParent()->end()) + return false; + + MachineBasicBlock *NextBB = &*std::next(MBBI); + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); + I != E; ++I) + if (*I == NextBB) + return true; + + return false; +} + +/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI, +/// look up the corresponding CPEntry. +CSKYConstantIslands::CPEntry * +CSKYConstantIslands::findConstPoolEntry(unsigned CPI, + const MachineInstr *CPEMI) { + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + // Number of entries per constpool index should be small, just do a + // linear search. + for (unsigned I = 0, E = CPEs.size(); I != E; ++I) { + if (CPEs[I].CPEMI == CPEMI) + return &CPEs[I]; + } + return nullptr; +} + +/// getCPEAlign - Returns the required alignment of the constant pool entry +/// represented by CPEMI. Alignment is measured in log2(bytes) units. +Align CSKYConstantIslands::getCPEAlign(const MachineInstr &CPEMI) { + assert(CPEMI.getOpcode() == CSKY::CONSTPOOL_ENTRY); + + unsigned CPI = CPEMI.getOperand(1).getIndex(); + assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); + return MCP->getConstants()[CPI].getAlign(); +} + +/// initializeFunctionInfo - Do the initial scan of the function, building up +/// information about the sizes of each block, the location of all the water, +/// and finding all of the constant pool users. +void CSKYConstantIslands::initializeFunctionInfo( + const std::vector<MachineInstr *> &CPEMIs) { + BBInfo.clear(); + BBInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) + computeBlockSize(&*I); + + // Compute block offsets. + adjustBBOffsetsAfter(&MF->front()); + + // Now go back through the instructions and build up our data structures. + for (MachineBasicBlock &MBB : *MF) { + // If this block doesn't fall through into the next MBB, then this is + // 'water' that a constant pool island could be placed. + if (!bbHasFallthrough(&MBB)) + WaterList.push_back(&MBB); + for (MachineInstr &MI : MBB) { + if (MI.isDebugInstr()) + continue; + + int Opc = MI.getOpcode(); + if (MI.isBranch() && !MI.isIndirectBranch()) { + bool IsCond = MI.isConditionalBranch(); + unsigned Bits = 0; + unsigned Scale = 1; + int UOpc = CSKY::BR32; + + switch (MI.getOpcode()) { + case CSKY::BR16: + case CSKY::BF16: + case CSKY::BT16: + Bits = 10; + Scale = 2; + break; + default: + Bits = 16; + Scale = 2; + break; + } + + // Record this immediate branch. + unsigned MaxOffs = ((1 << (Bits - 1)) - 1) * Scale; + ImmBranches.push_back(ImmBranch(&MI, MaxOffs, IsCond, UOpc)); + } + + if (Opc == CSKY::CONSTPOOL_ENTRY) + continue; + + // Scan the instructions for constant pool operands. + for (unsigned Op = 0, E = MI.getNumOperands(); Op != E; ++Op) + if (MI.getOperand(Op).isCPI()) { + // We found one. The addressing mode tells us the max displacement + // from the PC that this instruction permits. + + // Basic size info comes from the TSFlags field. 
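          // Worked numbers for the range computation below, derived from the
          // switch that follows: LRW32 gets Bits = 16 and Scale = 4, so
          // MaxOffs = ((1 << 16) - 1) * 4 = 262140 bytes of reach; GRS32 gets
          // Bits = 17 and Scale = 2 with NegOk set, roughly +/-256 KiB around
          // the instruction; the FLRW forms reach only
          // ((1 << 8) - 1) * 4 = 1020 bytes.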
+ unsigned Bits = 0; + unsigned Scale = 1; + bool NegOk = false; + + switch (Opc) { + default: + llvm_unreachable("Unknown addressing mode for CP reference!"); + case CSKY::MOVIH32: + case CSKY::ORI32: + continue; + case CSKY::PseudoTLSLA32: + case CSKY::JSRI32: + case CSKY::JMPI32: + case CSKY::LRW32: + case CSKY::LRW32_Gen: + Bits = 16; + Scale = 4; + break; + case CSKY::f2FLRW_S: + case CSKY::f2FLRW_D: + Bits = 8; + Scale = 4; + break; + case CSKY::GRS32: + Bits = 17; + Scale = 2; + NegOk = true; + break; + } + // Remember that this is a user of a CP entry. + unsigned CPI = MI.getOperand(Op).getIndex(); + MachineInstr *CPEMI = CPEMIs[CPI]; + unsigned MaxOffs = ((1 << Bits) - 1) * Scale; + CPUsers.push_back(CPUser(&MI, CPEMI, MaxOffs, NegOk)); + + // Increment corresponding CPEntry reference count. + CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Cannot find a corresponding CPEntry!"); + CPE->RefCount++; + + // Instructions can only use one CP entry, don't bother scanning the + // rest of the operands. + break; + } + } + } +} + +/// computeBlockSize - Compute the size and some alignment information for MBB. +/// This function updates BBInfo directly. +void CSKYConstantIslands::computeBlockSize(MachineBasicBlock *MBB) { + BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; + BBI.Size = 0; + + for (const MachineInstr &MI : *MBB) + BBI.Size += TII->getInstSizeInBytes(MI); +} + +/// getOffsetOf - Return the current offset of the specified machine instruction +/// from the start of the function. This offset changes as stuff is moved +/// around inside the function. +unsigned CSKYConstantIslands::getOffsetOf(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BBInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->getInstSizeInBytes(*I); + } + return Offset; +} + +/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB +/// ID. +static bool compareMbbNumbers(const MachineBasicBlock *LHS, + const MachineBasicBlock *RHS) { + return LHS->getNumber() < RHS->getNumber(); +} + +/// updateForInsertedWaterBlock - When a block is newly inserted into the +/// machine function, it upsets all of the block numbers. Renumber the blocks +/// and update the arrays that parallel this numbering. +void CSKYConstantIslands::updateForInsertedWaterBlock( + MachineBasicBlock *NewBB) { + // Renumber the MBB's to keep them consecutive. + NewBB->getParent()->RenumberBlocks(NewBB); + + // Insert an entry into BBInfo to align it properly with the (newly + // renumbered) block numbers. + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Next, update WaterList. Specifically, we need to add NewMBB as having + // available water after it. + water_iterator IP = llvm::lower_bound(WaterList, NewBB, compareMbbNumbers); + WaterList.insert(IP, NewBB); +} + +unsigned CSKYConstantIslands::getUserOffset(CPUser &U) const { + unsigned UserOffset = getOffsetOf(U.MI); + + UserOffset &= ~3u; + + return UserOffset; +} + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. 
Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +MachineBasicBlock * +CSKYConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) { + MachineBasicBlock *OrigBB = MI.getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); + MF->insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + + // TODO: Add support for 16bit instr. + BuildMI(OrigBB, DebugLoc(), TII->get(CSKY::BR32)).addMBB(NewBB); + ++NumSplit; + + // Update the CFG. All succs of OrigBB are now succs of NewBB. + NewBB->transferSuccessors(OrigBB); + + // OrigBB branches to NewBB. + OrigBB->addSuccessor(NewBB); + + // Update internal data structures to account for the newly inserted MBB. + // This is almost the same as updateForInsertedWaterBlock, except that + // the Water goes after OrigBB, not NewBB. + MF->RenumberBlocks(NewBB); + + // Insert an entry into BBInfo to align it properly with the (newly + // renumbered) block numbers. + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Next, update WaterList. Specifically, we need to add OrigMBB as having + // available water after it (but not if it's already there, which happens + // when splitting before a conditional branch that is followed by an + // unconditional branch - in that case we want to insert NewBB). + water_iterator IP = llvm::lower_bound(WaterList, OrigBB, compareMbbNumbers); + MachineBasicBlock *WaterBB = *IP; + if (WaterBB == OrigBB) + WaterList.insert(std::next(IP), NewBB); + else + WaterList.insert(IP, OrigBB); + NewWaterList.insert(OrigBB); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + computeBlockSize(OrigBB); + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + computeBlockSize(NewBB); + + // All BBOffsets following these blocks must be modified. + adjustBBOffsetsAfter(OrigBB); + + return NewBB; +} + +/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool +/// reference) is within MaxDisp of TrialOffset (a proposed location of a +/// constant pool entry). +bool CSKYConstantIslands::isOffsetInRange(unsigned UserOffset, + unsigned TrialOffset, + unsigned MaxDisp, bool NegativeOK) { + if (UserOffset <= TrialOffset) { + // User before the Trial. + if (TrialOffset - UserOffset <= MaxDisp) + return true; + } else if (NegativeOK) { + if (UserOffset - TrialOffset <= MaxDisp) + return true; + } + return false; +} + +/// isWaterInRange - Returns true if a CPE placed after the specified +/// Water (a basic block) will be in range for the specific MI. +/// +/// Compute how much the function will grow by inserting a CPE after Water. 
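/// Worked example, illustrative rather than from the source: with a
/// 4-byte-aligned next block, a CPE whose end lands 6 bytes past that block's
/// current offset costs Growth = 6 plus 2 bytes of re-alignment padding, 8 in
/// total; a CPE that fits inside the existing alignment padding costs
/// Growth = 0.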
+bool CSKYConstantIslands::isWaterInRange(unsigned UserOffset,
+                                         MachineBasicBlock *Water, CPUser &U,
+                                         unsigned &Growth) {
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset();
+  unsigned NextBlockOffset;
+  Align NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = ++Water->getIterator();
+  if (NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = Align(4);
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // than the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += offsetToAlignment(CPEEnd, NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth;
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool CSKYConstantIslands::isCPEntryInRange(MachineInstr *MI,
+                                           unsigned UserOffset,
+                                           MachineInstr *CPEMI,
+                                           unsigned MaxDisp, bool NegOk,
+                                           bool DoDump) {
+  unsigned CPEOffset = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    LLVM_DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset) << " in "
+             << printMBBReference(*MI->getParent()) << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset - UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true if the specified basic block's only
+/// predecessor unconditionally branches to its only successor.
+static bool bbIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == CSKY::BR32 /*TODO: change to 16bit instr. */)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif
+
+void CSKYConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for (unsigned I = BBNum + 1, E = MF->getNumBlockIDs(); I < E; ++I) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned Offset = BBInfo[I - 1].Offset + BBInfo[I - 1].Size;
+    BBInfo[I].Offset = Offset;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.
Returns true if we removed +/// the entry, false if we didn't. +bool CSKYConstantIslands::decrementCPEReferenceCount(unsigned CPI, + MachineInstr *CPEMI) { + // Find the old entry. Eliminate it if it is no longer used. + CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Unexpected!"); + if (--CPE->RefCount == 0) { + removeDeadCPEMI(CPEMI); + CPE->CPEMI = nullptr; + --NumCPEs; + return true; + } + return false; +} + +/// LookForCPEntryInRange - see if the currently referenced CPE is in range; +/// if not, see if an in-range clone of the CPE is in range, and if so, +/// change the data structures so the user references the clone. Returns: +/// 0 = no existing entry found +/// 1 = entry found, and there were no code insertions or deletions +/// 2 = entry found, and there were code insertions or deletions +int CSKYConstantIslands::findInRangeCPEntry(CPUser &U, unsigned UserOffset) { + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + + // Check to see if the CPE is already in-range. + if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk, + true)) { + LLVM_DEBUG(dbgs() << "In range\n"); + return 1; + } + + // No. Look for previously created clones of the CPE that are in range. + unsigned CPI = CPEMI->getOperand(1).getIndex(); + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + for (unsigned I = 0, E = CPEs.size(); I != E; ++I) { + // We already tried this one + if (CPEs[I].CPEMI == CPEMI) + continue; + // Removing CPEs can leave empty entries, skip + if (CPEs[I].CPEMI == nullptr) + continue; + if (isCPEntryInRange(UserMI, UserOffset, CPEs[I].CPEMI, U.getMaxDisp(), + U.NegOk)) { + LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" + << CPEs[I].CPI << "\n"); + // Point the CPUser node to the replacement + U.CPEMI = CPEs[I].CPEMI; + // Change the CPI in the instruction operand to refer to the clone. + for (unsigned J = 0, E = UserMI->getNumOperands(); J != E; ++J) + if (UserMI->getOperand(J).isCPI()) { + UserMI->getOperand(J).setIndex(CPEs[I].CPI); + break; + } + // Adjust the refcount of the clone... + CPEs[I].RefCount++; + // ...and the original. If we didn't remove the old entry, none of the + // addresses changed, so we don't need another pass. + return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; + } + } + return 0; +} + +/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in +/// the specific unconditional branch instruction. +static inline unsigned getUnconditionalBrDisp(int Opc) { + unsigned Bits, Scale; + + switch (Opc) { + case CSKY::BR16: + Bits = 10; + Scale = 2; + break; + case CSKY::BR32: + Bits = 16; + Scale = 2; + break; + default: + assert(0); + break; + } + + unsigned MaxOffs = ((1 << (Bits - 1)) - 1) * Scale; + return MaxOffs; +} + +/// findAvailableWater - Look for an existing entry in the WaterList in which +/// we can place the CPE referenced from U so it's within range of U's MI. +/// Returns true if found, false if not. If it returns true, WaterIter +/// is set to the WaterList entry. +/// To ensure that this pass +/// terminates, the CPE location for a particular CPUser is only allowed to +/// move to a lower address, so search backward from the end of the list and +/// prefer the first water that is in range. 
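+/// (A "water" block here is one after which an island of constant pool
+/// entries can be placed; WaterList is assumed to stay sorted by block
+/// number, as maintained by updateForInsertedWaterBlock.)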
+bool CSKYConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, + water_iterator &WaterIter) { + if (WaterList.empty()) + return false; + + unsigned BestGrowth = ~0u; + for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();; + --IP) { + MachineBasicBlock *WaterBB = *IP; + // Check if water is in range and is either at a lower address than the + // current "high water mark" or a new water block that was created since + // the previous iteration by inserting an unconditional branch. In the + // latter case, we want to allow resetting the high water mark back to + // this new water since we haven't seen it before. Inserting branches + // should be relatively uncommon and when it does happen, we want to be + // sure to take advantage of it for all the CPEs near that block, so that + // we don't insert more branches than necessary. + unsigned Growth; + if (isWaterInRange(UserOffset, WaterBB, U, Growth) && + (WaterBB->getNumber() < U.HighWaterMark->getNumber() || + NewWaterList.count(WaterBB)) && + Growth < BestGrowth) { + // This is the least amount of required padding seen so far. + BestGrowth = Growth; + WaterIter = IP; + LLVM_DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB) + << " Growth=" << Growth << '\n'); + + // Keep looking unless it is perfect. + if (BestGrowth == 0) + return true; + } + if (IP == B) + break; + } + return BestGrowth != ~0u; +} + +/// createNewWater - No existing WaterList entry will work for +/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the +/// block is used if in range, and the conditional branch munged so control +/// flow is correct. Otherwise the block is split to create a hole with an +/// unconditional branch around it. In either case NewMBB is set to a +/// block following which the new island can be inserted (the WaterList +/// is not adjusted). +void CSKYConstantIslands::createNewWater(unsigned CPUserIndex, + unsigned UserOffset, + MachineBasicBlock *&NewMBB) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + MachineBasicBlock *UserMBB = UserMI->getParent(); + const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; + + // If the block does not end in an unconditional branch already, and if the + // end of the block is within range, make new water there. + if (bbHasFallthrough(UserMBB)) { + // Size of branch to insert. + unsigned Delta = 4; + // Compute the offset where the CPE will begin. + unsigned CPEOffset = UserBBI.postOffset() + Delta; + + if (isOffsetInRange(UserOffset, CPEOffset, U)) { + LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB) + << format(", expected CPE offset %#x\n", CPEOffset)); + NewMBB = &*++UserMBB->getIterator(); + // Add an unconditional branch from UserMBB to fallthrough block. Record + // it for branch lengthening; this new branch will not get out of range, + // but if the preceding conditional branch is out of range, the targets + // will be exchanged, and the altered branch may be out of range, so the + // machinery has to know about it. + + // TODO: Add support for 16bit instr. 
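+      // For example, BR32 encodes a 16-bit offset scaled by 2, so the branch
+      // added below can reach ((1 << 15) - 1) * 2 = 65534 bytes (see
+      // getUnconditionalBrDisp).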
+      int UncondBr = CSKY::BR32;
+      auto *NewMI = BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr))
+                        .addMBB(NewMBB)
+                        .getInstr();
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(
+          ImmBranch(&UserMBB->back(), MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += TII->getInstSizeInBytes(*NewMI);
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block. Find a place within the block to split it.
+
+  // Try to split the block so it's fully aligned. Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // Align which is the largest possible alignment in the function.
+  const Align Align = MF->getAlignment();
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+  LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                              BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting.
+  // Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+                    << " la=" << Log2(Align) << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - 8;
+    LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset =
+      BaseInsertOffset + 4 + CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex + 1;
+  unsigned NumCPUsers = CPUsers.size();
+  for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->getInstSizeInBytes(*MI), MI = std::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift insertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= Align.value();
+        EndInsertOffset -= Align.value();
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much. Also assume CPEs
+      // are added in order with alignment padding. We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+  }
+
+  NewMBB = splitBlockBeforeInstr(*--MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool CSKYConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
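+  // findInRangeCPEntry returns 1 when the current entry (or an existing
+  // clone) is already usable, and 2 when switching to a clone removed the
+  // original entry and therefore changed offsets.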
+ int result = findInRangeCPEntry(U, UserOffset); + if (result == 1) + return false; + if (result == 2) + return true; + + // Look for water where we can place this CPE. + MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock(); + MachineBasicBlock *NewMBB; + water_iterator IP; + if (findAvailableWater(U, UserOffset, IP)) { + LLVM_DEBUG(dbgs() << "Found water in range\n"); + MachineBasicBlock *WaterBB = *IP; + + // If the original WaterList entry was "new water" on this iteration, + // propagate that to the new island. This is just keeping NewWaterList + // updated to match the WaterList, which will be updated below. + if (NewWaterList.erase(WaterBB)) + NewWaterList.insert(NewIsland); + + // The new CPE goes before the following block (NewMBB). + NewMBB = &*++WaterBB->getIterator(); + } else { + LLVM_DEBUG(dbgs() << "No water found\n"); + createNewWater(CPUserIndex, UserOffset, NewMBB); + + // splitBlockBeforeInstr adds to WaterList, which is important when it is + // called while handling branches so that the water will be seen on the + // next iteration for constant pools, but in this context, we don't want + // it. Check for this so it will be removed from the WaterList. + // Also remove any entry from NewWaterList. + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); + IP = llvm::find(WaterList, WaterBB); + if (IP != WaterList.end()) + NewWaterList.erase(WaterBB); + + // We are adding new water. Update NewWaterList. + NewWaterList.insert(NewIsland); + } + + // Remove the original WaterList entry; we want subsequent insertions in + // this vicinity to go after the one we're about to insert. This + // considerably reduces the number of times we have to move the same CPE + // more than once and is also important to ensure the algorithm terminates. + if (IP != WaterList.end()) + WaterList.erase(IP); + + // Okay, we know we can put an island before NewMBB now, do it! + MF->insert(NewMBB->getIterator(), NewIsland); + + // Update internal data structures to account for the newly inserted MBB. + updateForInsertedWaterBlock(NewIsland); + + // Decrement the old entry, and remove it if refcount becomes 0. + decrementCPEReferenceCount(CPI, CPEMI); + + // No existing clone of this CPE is within range. + // We will be generating a new clone. Get a UID for it. + unsigned ID = createPICLabelUId(); + + // Now that we have an island to add the CPE to, clone the original CPE and + // add it to the island. + U.HighWaterMark = NewIsland; + U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(CSKY::CONSTPOOL_ENTRY)) + .addImm(ID) + .addConstantPoolIndex(CPI) + .addImm(Size); + CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); + ++NumCPEs; + + // Mark the basic block as aligned as required by the const-pool entry. + NewIsland->setAlignment(getCPEAlign(*U.CPEMI)); + + // Increase the size of the island block to account for the new entry. + BBInfo[NewIsland->getNumber()].Size += Size; + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); + + // Finally, change the CPI in the instruction operand to be ID. + for (unsigned I = 0, E = UserMI->getNumOperands(); I != E; ++I) + if (UserMI->getOperand(I).isCPI()) { + UserMI->getOperand(I).setIndex(ID); + break; + } + + LLVM_DEBUG( + dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI + << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset)); + + return true; +} + +/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update +/// sizes and offsets of impacted basic blocks. 
+void CSKYConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { + MachineBasicBlock *CPEBB = CPEMI->getParent(); + unsigned Size = CPEMI->getOperand(2).getImm(); + CPEMI->eraseFromParent(); + BBInfo[CPEBB->getNumber()].Size -= Size; + // All succeeding offsets have the current size value added in, fix this. + if (CPEBB->empty()) { + BBInfo[CPEBB->getNumber()].Size = 0; + + // This block no longer needs to be aligned. + CPEBB->setAlignment(Align(4)); + } else { + // Entries are sorted by descending alignment, so realign from the front. + CPEBB->setAlignment(getCPEAlign(*CPEBB->begin())); + } + + adjustBBOffsetsAfter(CPEBB); + // An island has only one predecessor BB and one successor BB. Check if + // this BB's predecessor jumps directly to this BB's successor. This + // shouldn't happen currently. + assert(!bbIsJumpedOver(CPEBB) && "How did this happen?"); + // FIXME: remove the empty blocks after all the work is done? +} + +/// removeUnusedCPEntries - Remove constant pool entries whose refcounts +/// are zero. +bool CSKYConstantIslands::removeUnusedCPEntries() { + unsigned MadeChange = false; + for (unsigned I = 0, E = CPEntries.size(); I != E; ++I) { + std::vector<CPEntry> &CPEs = CPEntries[I]; + for (unsigned J = 0, Ee = CPEs.size(); J != Ee; ++J) { + if (CPEs[J].RefCount == 0 && CPEs[J].CPEMI) { + removeDeadCPEMI(CPEs[J].CPEMI); + CPEs[J].CPEMI = nullptr; + MadeChange = true; + } + } + } + return MadeChange; +} + +/// isBBInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool CSKYConstantIslands::isBBInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned MaxDisp) { + unsigned BrOffset = getOffsetOf(MI); + unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; + + LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB) + << " from " << printMBBReference(*MI->getParent()) + << " max delta=" << MaxDisp << " from " << getOffsetOf(MI) + << " to " << DestOffset << " offset " + << int(DestOffset - BrOffset) << "\t" << *MI); + + if (BrOffset <= DestOffset) { + // Branch before the Dest. + if (DestOffset - BrOffset <= MaxDisp) + return true; + } else { + if (BrOffset - DestOffset <= MaxDisp) + return true; + } + return false; +} + +/// fixupImmediateBr - Fix up an immediate branch whose destination is too far +/// away to fit in its displacement field. +bool CSKYConstantIslands::fixupImmediateBr(ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = TII->getBranchDestBlock(*MI); + + // Check to see if the DestBB is already in-range. + if (isBBInRange(MI, DestBB, Br.MaxDisp)) + return false; + + if (!Br.IsCond) + return fixupUnconditionalBr(Br); + return fixupConditionalBr(Br); +} + +/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is +/// too far away to fit in its displacement field. If the LR register has been +/// spilled in the epilogue, then we can use BSR to implement a far jump. +/// Otherwise, add an intermediate branch instruction to a branch. +bool CSKYConstantIslands::fixupUnconditionalBr(ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *MBB = MI->getParent(); + + if (!MFI->isLRSpilled()) + report_fatal_error("underestimated function size"); + + // Use BSR to implement far jump. 
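+  // BSR32_BR is assumed to encode a 26-bit signed halfword offset, giving
+  // the ((1 << 25) - 1) * 2 = 67108862 byte range computed below.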
+ Br.MaxDisp = ((1 << (26 - 1)) - 1) * 2; + MI->setDesc(TII->get(CSKY::BSR32_BR)); + BBInfo[MBB->getNumber()].Size += 4; + adjustBBOffsetsAfter(MBB); + ++NumUBrFixed; + + LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI); + + return true; +} + +/// fixupConditionalBr - Fix up a conditional branch whose destination is too +/// far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. +bool CSKYConstantIslands::fixupConditionalBr(ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = TII->getBranchDestBlock(*MI); + + SmallVector<MachineOperand, 4> Cond; + Cond.push_back(MachineOperand::CreateImm(MI->getOpcode())); + Cond.push_back(MI->getOperand(0)); + TII->reverseBranchCondition(Cond); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // bteqz L1 + // => + // bnez L2 + // b L1 + // L2: + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !bbHasFallthrough(MBB); + + ++NumCBrFixed; + if (BMI != MI) { + if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) && + BMI->isUnconditionalBranch()) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beqz L1 + // b L2 + // => + // bnez L2 + // b L1 + MachineBasicBlock *NewDest = TII->getBranchDestBlock(*BMI); + if (isBBInRange(MI, NewDest, Br.MaxDisp)) { + LLVM_DEBUG( + dbgs() << " Invert Bcc condition and swap its destination with " + << *BMI); + BMI->getOperand(BMI->getNumExplicitOperands() - 1).setMBB(DestBB); + MI->getOperand(MI->getNumExplicitOperands() - 1).setMBB(NewDest); + + MI->setDesc(TII->get(Cond[0].getImm())); + return true; + } + } + } + + if (NeedSplit) { + splitBlockBeforeInstr(*MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int Delta = TII->getInstSizeInBytes(MBB->back()); + BBInfo[MBB->getNumber()].Size -= Delta; + MBB->back().eraseFromParent(); + // BBInfo[SplitBB].Offset is wrong temporarily, fixed below + + // The conditional successor will be swapped between the BBs after this, so + // update CFG. + MBB->addSuccessor(DestBB); + std::next(MBB->getIterator())->removeSuccessor(DestBB); + } + MachineBasicBlock *NextBB = &*++MBB->getIterator(); + + LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB) + << " also invert condition and change dest. to " + << printMBBReference(*NextBB) << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + // Also update the ImmBranch as well as adding a new entry for the new branch. + + BuildMI(MBB, DebugLoc(), TII->get(Cond[0].getImm())) + .addReg(MI->getOperand(0).getReg()) + .addMBB(NextBB); + + Br.MI = &MBB->back(); + BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back()); + BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB); + BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back()); + unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); + ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); + + // Remove the old conditional branch. It may or may not still be in MBB. 
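+  // If the block was split above, MI was spliced into the new successor
+  // block, so charge its size to whichever block contains it now.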
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI); + MI->eraseFromParent(); + adjustBBOffsetsAfter(MBB); + return true; +} + +/// Returns a pass that converts branches to long branches. +FunctionPass *llvm::createCSKYConstantIslandPass() { + return new CSKYConstantIslands(); +} + +INITIALIZE_PASS(CSKYConstantIslands, DEBUG_TYPE, + "CSKY constant island placement and branch shortening pass", + false, false) diff --git a/llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp new file mode 100644 index 000000000000..d4c4bb847237 --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp @@ -0,0 +1,216 @@ +//===-- CSKYConstantPoolValue.cpp - CSKY constantpool value ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the CSKY specific constantpool value class. +// +//===----------------------------------------------------------------------===// + +#include "CSKYConstantPoolValue.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// CSKYConstantPoolValue +//===----------------------------------------------------------------------===// + +CSKYConstantPoolValue::CSKYConstantPoolValue(Type *Ty, CSKYCP::CSKYCPKind Kind, + unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, + bool AddCurrentAddress, + unsigned ID) + : MachineConstantPoolValue(Ty), Kind(Kind), PCAdjust(PCAdjust), + Modifier(Modifier), AddCurrentAddress(AddCurrentAddress), LabelId(ID) {} + +const char *CSKYConstantPoolValue::getModifierText() const { + switch (Modifier) { + case CSKYCP::ADDR: + return "ADDR"; + case CSKYCP::GOT: + return "GOT"; + case CSKYCP::GOTOFF: + return "GOTOFF"; + case CSKYCP::PLT: + return "PLT"; + case CSKYCP::TLSIE: + return "TLSIE"; + case CSKYCP::TLSLE: + return "TLSLE"; + case CSKYCP::TLSGD: + return "TLSGD"; + case CSKYCP::NO_MOD: + return ""; + } + llvm_unreachable("Unknown modifier!"); +} + +int CSKYConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) { + llvm_unreachable("Shouldn't be calling this directly!"); +} + +void CSKYConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddInteger(LabelId); + ID.AddInteger(PCAdjust); + ID.AddInteger(Modifier); +} + +void CSKYConstantPoolValue::print(raw_ostream &O) const { + if (Modifier) + O << "(" << getModifierText() << ")"; + if (PCAdjust) + O << " + " << PCAdjust; +} + +//===----------------------------------------------------------------------===// +// CSKYConstantPoolConstant +//===----------------------------------------------------------------------===// + +CSKYConstantPoolConstant::CSKYConstantPoolConstant( + const Constant *C, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress, unsigned ID) + : CSKYConstantPoolValue(C->getType(), Kind, PCAdjust, Modifier, + AddCurrentAddress, ID), + CVal(C) {} + +CSKYConstantPoolConstant *CSKYConstantPoolConstant::Create( + const Constant *C, 
CSKYCP::CSKYCPKind Kind, unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress, unsigned ID) { + return new CSKYConstantPoolConstant(C, Kind, PCAdjust, Modifier, + AddCurrentAddress, ID); +} + +const GlobalValue *CSKYConstantPoolConstant::getGV() const { + assert(isa<GlobalValue>(CVal) && "CVal should be GlobalValue"); + return cast<GlobalValue>(CVal); +} + +const BlockAddress *CSKYConstantPoolConstant::getBlockAddress() const { + assert(isa<BlockAddress>(CVal) && "CVal should be BlockAddress"); + return cast<BlockAddress>(CVal); +} + +int CSKYConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) { + return getExistingMachineCPValueImpl<CSKYConstantPoolConstant>(CP, Alignment); +} + +void CSKYConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(CVal); + + CSKYConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void CSKYConstantPoolConstant::print(raw_ostream &O) const { + O << CVal->getName(); + CSKYConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// CSKYConstantPoolSymbol +//===----------------------------------------------------------------------===// + +CSKYConstantPoolSymbol::CSKYConstantPoolSymbol(Type *Ty, const char *S, + unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, + bool AddCurrentAddress) + : CSKYConstantPoolValue(Ty, CSKYCP::CPExtSymbol, PCAdjust, Modifier, + AddCurrentAddress), + S(strdup(S)) {} + +CSKYConstantPoolSymbol * +CSKYConstantPoolSymbol::Create(Type *Ty, const char *S, unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier) { + return new CSKYConstantPoolSymbol(Ty, S, PCAdjust, Modifier, false); +} + +int CSKYConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) { + + return getExistingMachineCPValueImpl<CSKYConstantPoolSymbol>(CP, Alignment); +} + +void CSKYConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddString(S); + CSKYConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void CSKYConstantPoolSymbol::print(raw_ostream &O) const { + O << S; + CSKYConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// CSKYConstantPoolMBB +//===----------------------------------------------------------------------===// + +CSKYConstantPoolMBB::CSKYConstantPoolMBB(Type *Ty, const MachineBasicBlock *Mbb, + unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, + bool AddCurrentAddress) + : CSKYConstantPoolValue(Ty, CSKYCP::CPMachineBasicBlock, PCAdjust, Modifier, + AddCurrentAddress), + MBB(Mbb) {} + +CSKYConstantPoolMBB *CSKYConstantPoolMBB::Create(Type *Ty, + const MachineBasicBlock *Mbb, + unsigned PCAdjust) { + return new CSKYConstantPoolMBB(Ty, Mbb, PCAdjust, CSKYCP::ADDR, false); +} + +int CSKYConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) { + return getExistingMachineCPValueImpl<CSKYConstantPoolMBB>(CP, Alignment); +} + +void CSKYConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(MBB); + CSKYConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void CSKYConstantPoolMBB::print(raw_ostream &O) const { + O << "BB#" << MBB->getNumber(); + CSKYConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// CSKYConstantPoolJT +//===----------------------------------------------------------------------===// + +CSKYConstantPoolJT::CSKYConstantPoolJT(Type *Ty, int JTIndex, unsigned PCAdj, + 
CSKYCP::CSKYCPModifier Modifier, + bool AddCurrentAddress) + : CSKYConstantPoolValue(Ty, CSKYCP::CPJT, PCAdj, Modifier, + AddCurrentAddress), + JTI(JTIndex) {} + +CSKYConstantPoolJT * +CSKYConstantPoolJT::Create(Type *Ty, int JTI, unsigned PCAdj, + CSKYCP::CSKYCPModifier Modifier) { + return new CSKYConstantPoolJT(Ty, JTI, PCAdj, Modifier, false); +} + +int CSKYConstantPoolJT::getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) { + return getExistingMachineCPValueImpl<CSKYConstantPoolJT>(CP, Alignment); +} + +void CSKYConstantPoolJT::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddInteger(JTI); + CSKYConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void CSKYConstantPoolJT::print(raw_ostream &O) const { + O << "JTI#" << JTI; + CSKYConstantPoolValue::print(O); +} diff --git a/llvm/lib/Target/CSKY/CSKYConstantPoolValue.h b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.h new file mode 100644 index 000000000000..2eff9404a34c --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.h @@ -0,0 +1,221 @@ +//===-- CSKYConstantPoolValue.h - CSKY constantpool value -----*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the CSKY specific constantpool value class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_CSKY_CONSTANTPOOLVALUE_H +#define LLVM_TARGET_CSKY_CONSTANTPOOLVALUE_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include <cstddef> + +namespace llvm { + +class BlockAddress; +class Constant; +class GlobalValue; +class LLVMContext; +class MachineBasicBlock; + +namespace CSKYCP { +enum CSKYCPKind { + CPValue, + CPExtSymbol, + CPBlockAddress, + CPMachineBasicBlock, + CPJT +}; + +enum CSKYCPModifier { NO_MOD, ADDR, GOT, GOTOFF, PLT, TLSLE, TLSIE, TLSGD }; +} // namespace CSKYCP + +/// CSKYConstantPoolValue - CSKY specific constantpool value. This is used to +/// represent PC-relative displacement between the address of the load +/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)). +class CSKYConstantPoolValue : public MachineConstantPoolValue { +protected: + CSKYCP::CSKYCPKind Kind; // Kind of constant. + unsigned PCAdjust; // Extra adjustment if constantpool is pc-relative. 
+ CSKYCP::CSKYCPModifier Modifier; // GV modifier + bool AddCurrentAddress; + + unsigned LabelId = 0; + + CSKYConstantPoolValue(Type *Ty, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress, + unsigned ID = 0); + +public: + const char *getModifierText() const; + unsigned getPCAdjustment() const { return PCAdjust; } + bool mustAddCurrentAddress() const { return AddCurrentAddress; } + CSKYCP::CSKYCPModifier getModifier() const { return Modifier; } + unsigned getLabelID() const { return LabelId; } + + bool isGlobalValue() const { return Kind == CSKYCP::CPValue; } + bool isExtSymbol() const { return Kind == CSKYCP::CPExtSymbol; } + bool isBlockAddress() const { return Kind == CSKYCP::CPBlockAddress; } + bool isMachineBasicBlock() const { + return Kind == CSKYCP::CPMachineBasicBlock; + } + bool isJT() const { return Kind == CSKYCP::CPJT; } + + int getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) override; + + void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; + + void print(raw_ostream &O) const override; + + bool equals(const CSKYConstantPoolValue *A) const { + return this->LabelId == A->LabelId && this->PCAdjust == A->PCAdjust && + this->Modifier == A->Modifier; + } + + template <typename Derived> + int getExistingMachineCPValueImpl(MachineConstantPool *CP, Align Alignment) { + const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + Constants[i].getAlign() >= Alignment) { + auto *CPV = + static_cast<CSKYConstantPoolValue *>(Constants[i].Val.MachineCPVal); + if (Derived *APC = dyn_cast<Derived>(CPV)) + if (cast<Derived>(this)->equals(APC)) + return i; + } + } + + return -1; + } +}; + +/// CSKY-specific constant pool values for Constants, +/// Functions, and BlockAddresses. +class CSKYConstantPoolConstant : public CSKYConstantPoolValue { + const Constant *CVal; // Constant being loaded. + + CSKYConstantPoolConstant(const Constant *C, CSKYCP::CSKYCPKind Kind, + unsigned PCAdjust, CSKYCP::CSKYCPModifier Modifier, + bool AddCurrentAddress, unsigned ID); + +public: + static CSKYConstantPoolConstant * + Create(const Constant *C, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust, + CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress, + unsigned ID = 0); + const GlobalValue *getGV() const; + const BlockAddress *getBlockAddress() const; + + int getExistingMachineCPValue(MachineConstantPool *CP, + Align Alignment) override; + void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; + void print(raw_ostream &O) const override; + + bool equals(const CSKYConstantPoolConstant *A) const { + return CVal == A->CVal && CSKYConstantPoolValue::equals(A); + } + + static bool classof(const CSKYConstantPoolValue *APV) { + return APV->isGlobalValue() || APV->isBlockAddress(); + } +}; + +/// CSKYConstantPoolSymbol - CSKY-specific constantpool values for external +/// symbols. +class CSKYConstantPoolSymbol : public CSKYConstantPoolValue { + const std::string S; // ExtSymbol being loaded. 
+
+  CSKYConstantPoolSymbol(Type *Ty, const char *S, unsigned PCAdjust,
+                         CSKYCP::CSKYCPModifier Modifier,
+                         bool AddCurrentAddress);
+
+public:
+  static CSKYConstantPoolSymbol *Create(Type *Ty, const char *S,
+                                        unsigned PCAdjust,
+                                        CSKYCP::CSKYCPModifier Modifier);
+
+  StringRef getSymbol() const { return S; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                Align Alignment) override;
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+  void print(raw_ostream &O) const override;
+
+  bool equals(const CSKYConstantPoolSymbol *A) const {
+    return S == A->S && CSKYConstantPoolValue::equals(A);
+  }
+
+  static bool classof(const CSKYConstantPoolValue *ACPV) {
+    return ACPV->isExtSymbol();
+  }
+};
+
+/// CSKYConstantPoolMBB - CSKY-specific constantpool value of a machine basic
+/// block.
+class CSKYConstantPoolMBB : public CSKYConstantPoolValue {
+  const MachineBasicBlock *MBB; // Machine basic block.
+
+  CSKYConstantPoolMBB(Type *Ty, const MachineBasicBlock *Mbb, unsigned PCAdjust,
+                      CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress);
+
+public:
+  static CSKYConstantPoolMBB *Create(Type *Ty, const MachineBasicBlock *Mbb,
+                                     unsigned PCAdjust);
+
+  const MachineBasicBlock *getMBB() const { return MBB; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                Align Alignment) override;
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+  void print(raw_ostream &O) const override;
+
+  bool equals(const CSKYConstantPoolMBB *A) const {
+    return MBB == A->MBB && CSKYConstantPoolValue::equals(A);
+  }
+
+  static bool classof(const CSKYConstantPoolValue *ACPV) {
+    return ACPV->isMachineBasicBlock();
+  }
+};
+
+/// CSKY-specific constantpool value of a jump table.
+class CSKYConstantPoolJT : public CSKYConstantPoolValue {
+  signed JTI; // Jump table index.
+
+  CSKYConstantPoolJT(Type *Ty, int JTIndex, unsigned PCAdj,
+                     CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress);
+
+public:
+  static CSKYConstantPoolJT *Create(Type *Ty, int JTI, unsigned PCAdj,
+                                    CSKYCP::CSKYCPModifier Modifier);
+
+  signed getJTI() { return JTI; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                Align Alignment) override;
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+  void print(raw_ostream &O) const override;
+
+  bool equals(const CSKYConstantPoolJT *A) const {
+    return JTI == A->JTI && CSKYConstantPoolValue::equals(A);
+  }
+
+  static bool classof(const CSKYConstantPoolValue *ACPV) {
+    return ACPV->isJT();
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
index 3a8ee5713584..3bf001c2cee7 100644
--- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CSKYFrameLowering.h"
+#include "CSKYMachineFunctionInfo.h"
 #include "CSKYSubtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -46,12 +47,555 @@ bool CSKYFrameLowering::hasBP(const MachineFunction &MF) const {
   return MFI.hasVarSizedObjects();
 }
 
+// Determines the size of the frame and maximum call frame size.
+void CSKYFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const CSKYRegisterInfo *RI = STI.getRegisterInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t FrameSize = MFI.getStackSize();
+
+  // Get the alignment.
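+  // Illustrative example: with a 4-byte stack alignment and a 16-byte-aligned
+  // object in the frame, 12 extra bytes are reserved so the frame still fits
+  // once SP is realigned.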
+ Align StackAlign = getStackAlign(); + if (RI->hasStackRealignment(MF)) { + Align MaxStackAlign = std::max(StackAlign, MFI.getMaxAlign()); + FrameSize += (MaxStackAlign.value() - StackAlign.value()); + StackAlign = MaxStackAlign; + } + + // Set Max Call Frame Size + uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign); + MFI.setMaxCallFrameSize(MaxCallSize); + + // Make sure the frame is aligned. + FrameSize = alignTo(FrameSize, StackAlign); + + // Update frame info. + MFI.setStackSize(FrameSize); +} + void CSKYFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // FIXME: Implement this when we have function calls + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + const CSKYRegisterInfo *RI = STI.getRegisterInfo(); + const CSKYInstrInfo *TII = STI.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + Register FPReg = getFPReg(STI); + Register SPReg = CSKY::R14; + Register BPReg = getBPReg(STI); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + if (MF.getFunction().hasFnAttribute("interrupt")) + BuildMI(MBB, MBBI, DL, TII->get(CSKY::NIE)); + + // Determine the correct frame layout + determineFrameLayout(MF); + + // FIXME (note copied from Lanai): This appears to be overallocating. Needs + // investigation. Get the number of bytes to allocate from the FrameInfo. + uint64_t StackSize = MFI.getStackSize(); + + // Early exit if there is no need to allocate on the stack + if (StackSize == 0 && !MFI.adjustsStack()) + return; + + const auto &CSI = MFI.getCalleeSavedInfo(); + + unsigned spillAreaSize = CFI->getCalleeSaveAreaSize(); + + uint64_t ActualSize = spillAreaSize + CFI->getVarArgsSaveSize(); + + // First part stack allocation. + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -(static_cast<int64_t>(ActualSize)), + MachineInstr::NoFlags); + + // Emit ".cfi_def_cfa_offset FirstSPAdjustAmount" + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, ActualSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // The frame pointer is callee-saved, and code has been generated for us to + // save it to the stack. We need to skip over the storing of callee-saved + // registers as the frame pointer must be modified after it has been saved + // to the stack, not before. + // FIXME: assumes exactly one instruction is used to save each callee-saved + // register. + std::advance(MBBI, CSI.size()); + + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. + for (const auto &Entry : CSI) { + int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); + Register Reg = Entry.getReg(); + + unsigned Num = TRI->getRegSizeInBits(Reg, MRI) / 32; + for (unsigned i = 0; i < Num; i++) { + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, RI->getDwarfRegNum(Reg, true) + i, Offset + i * 4)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + + // Generate new FP. 
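+  // When a frame pointer is used, the rest of the frame is allocated only
+  // after FP has been set, so FP-relative offsets to the callee-save and
+  // varargs areas stay fixed (see getFrameIndexReference).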
+ if (hasFP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), FPReg) + .addReg(SPReg) + .setMIFlag(MachineInstr::FrameSetup); + + // Emit ".cfi_def_cfa_register $fp" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( + nullptr, RI->getDwarfRegNum(FPReg, true))); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Second part stack allocation. + adjustReg(MBB, MBBI, DL, SPReg, SPReg, + -(static_cast<int64_t>(StackSize - ActualSize)), + MachineInstr::NoFlags); + + // Realign Stack + const CSKYRegisterInfo *RI = STI.getRegisterInfo(); + if (RI->hasStackRealignment(MF)) { + Align MaxAlignment = MFI.getMaxAlign(); + + const CSKYInstrInfo *TII = STI.getInstrInfo(); + if (STI.hasE2() && isUInt<12>(~(-(int)MaxAlignment.value()))) { + BuildMI(MBB, MBBI, DL, TII->get(CSKY::ANDNI32), SPReg) + .addReg(SPReg) + .addImm(~(-(int)MaxAlignment.value())); + } else { + unsigned ShiftAmount = Log2(MaxAlignment); + + if (STI.hasE2()) { + Register VR = + MF.getRegInfo().createVirtualRegister(&CSKY::GPRRegClass); + BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSRI32), VR) + .addReg(SPReg) + .addImm(ShiftAmount); + BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSLI32), SPReg) + .addReg(VR) + .addImm(ShiftAmount); + } else { + Register VR = + MF.getRegInfo().createVirtualRegister(&CSKY::mGPRRegClass); + BuildMI(MBB, MBBI, DL, TII->get(CSKY::MOV16), VR).addReg(SPReg); + BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSRI16), VR) + .addReg(VR) + .addImm(ShiftAmount); + BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSLI16), VR) + .addReg(VR) + .addImm(ShiftAmount); + BuildMI(MBB, MBBI, DL, TII->get(CSKY::MOV16), SPReg).addReg(VR); + } + } + } + + // FP will be used to restore the frame in the epilogue, so we need + // another base register BP to record SP after re-alignment. SP will + // track the current stack after allocating variable sized objects. + if (hasBP(MF)) { + // move BP, SP + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BPReg).addReg(SPReg); + } + + } else { + adjustReg(MBB, MBBI, DL, SPReg, SPReg, + -(static_cast<int64_t>(StackSize - ActualSize)), + MachineInstr::NoFlags); + // Emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } } void CSKYFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // FIXME: Implement this when we have function calls + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + + MachineFrameInfo &MFI = MF.getFrameInfo(); + Register FPReg = getFPReg(STI); + Register SPReg = CSKY::R14; + + // Get the insert location for the epilogue. If there were no terminators in + // the block, get the last instruction. + MachineBasicBlock::iterator MBBI = MBB.end(); + DebugLoc DL; + if (!MBB.empty()) { + MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.end()) + MBBI = MBB.getLastNonDebugInstr(); + DL = MBBI->getDebugLoc(); + + // If this is not a terminator, the actual insert location should be after + // the last instruction. 
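+    // (New instructions are inserted before MBBI, so stepping past the last
+    // instruction places the epilogue after it.)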
+ if (!MBBI->isTerminator()) + MBBI = std::next(MBBI); + } + + const auto &CSI = MFI.getCalleeSavedInfo(); + uint64_t StackSize = MFI.getStackSize(); + + uint64_t ActualSize = + CFI->getCalleeSaveAreaSize() + CFI->getVarArgsSaveSize(); + + // Skip to before the restores of callee-saved registers + // FIXME: assumes exactly one instruction is used to restore each + // callee-saved register. + auto LastFrameDestroy = MBBI; + if (!CSI.empty()) + LastFrameDestroy = std::prev(MBBI, CSI.size()); + + if (hasFP(MF)) { + const CSKYInstrInfo *TII = STI.getInstrInfo(); + BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::COPY), SPReg) + .addReg(FPReg) + .setMIFlag(MachineInstr::NoFlags); + } else { + adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, (StackSize - ActualSize), + MachineInstr::FrameDestroy); + } + + adjustReg(MBB, MBBI, DL, SPReg, SPReg, ActualSize, + MachineInstr::FrameDestroy); +} + +static unsigned estimateRSStackSizeLimit(MachineFunction &MF, + const CSKYSubtarget &STI) { + unsigned Limit = (1 << 12) - 1; + + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.isDebugInstr()) + continue; + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isFI()) + continue; + + if (MI.getOpcode() == CSKY::SPILL_CARRY || + MI.getOpcode() == CSKY::RESTORE_CARRY || + MI.getOpcode() == CSKY::STORE_PAIR || + MI.getOpcode() == CSKY::LOAD_PAIR) { + Limit = std::min(Limit, ((1U << 12) - 1) * 4); + break; + } + + if (MI.getOpcode() == CSKY::ADDI32) { + Limit = std::min(Limit, (1U << 12)); + break; + } + + if (MI.getOpcode() == CSKY::ADDI16XZ) { + Limit = std::min(Limit, (1U << 3)); + break; + } + + // ADDI16 will not require an extra register, + // it can reuse the destination. + if (MI.getOpcode() == CSKY::ADDI16) + break; + + // Otherwise check the addressing mode. + switch (MI.getDesc().TSFlags & CSKYII::AddrModeMask) { + default: + LLVM_DEBUG(MI.dump()); + llvm_unreachable( + "Unhandled addressing mode in stack size limit calculation"); + case CSKYII::AddrMode32B: + Limit = std::min(Limit, (1U << 12) - 1); + break; + case CSKYII::AddrMode32H: + Limit = std::min(Limit, ((1U << 12) - 1) * 2); + break; + case CSKYII::AddrMode32WD: + Limit = std::min(Limit, ((1U << 12) - 1) * 4); + break; + case CSKYII::AddrMode16B: + Limit = std::min(Limit, (1U << 5) - 1); + break; + case CSKYII::AddrMode16H: + Limit = std::min(Limit, ((1U << 5) - 1) * 2); + break; + case CSKYII::AddrMode16W: + Limit = std::min(Limit, ((1U << 5) - 1) * 4); + break; + case CSKYII::AddrMode32SDF: + Limit = std::min(Limit, ((1U << 8) - 1) * 4); + break; + } + break; // At most one FI per instruction + } + } + } + + return Limit; +} + +void CSKYFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (hasFP(MF)) + SavedRegs.set(CSKY::R8); + + // Mark BP as used if function has dedicated base pointer. + if (hasBP(MF)) + SavedRegs.set(CSKY::R7); + + // If interrupt is enabled and there are calls in the handler, + // unconditionally save all Caller-saved registers and + // all FP registers, regardless whether they are used. 
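+  // For example, an interrupt handler that calls a helper function must also
+  // preserve R0-R3 and R12-R13 below, since the callee is free to clobber
+  // them.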
+  if (MF.getFunction().hasFnAttribute("interrupt") && MFI.hasCalls()) {
+
+    static const MCPhysReg CSRegs[] = {CSKY::R0,  CSKY::R1,  CSKY::R2,
+                                       CSKY::R3,  CSKY::R12, CSKY::R13, 0};
+
+    for (unsigned i = 0; CSRegs[i]; ++i)
+      SavedRegs.set(CSRegs[i]);
+
+    if (STI.hasHighRegisters()) {
+
+      static const MCPhysReg CSHRegs[] = {CSKY::R18, CSKY::R19, CSKY::R20,
+                                          CSKY::R21, CSKY::R22, CSKY::R23,
+                                          CSKY::R24, CSKY::R25, 0};
+
+      for (unsigned i = 0; CSHRegs[i]; ++i)
+        SavedRegs.set(CSHRegs[i]);
+    }
+
+    static const MCPhysReg CSF32Regs[] = {
+        CSKY::F8_32,  CSKY::F9_32,  CSKY::F10_32,
+        CSKY::F11_32, CSKY::F12_32, CSKY::F13_32,
+        CSKY::F14_32, CSKY::F15_32, 0};
+    static const MCPhysReg CSF64Regs[] = {
+        CSKY::F8_64,  CSKY::F9_64,  CSKY::F10_64,
+        CSKY::F11_64, CSKY::F12_64, CSKY::F13_64,
+        CSKY::F14_64, CSKY::F15_64, 0};
+
+    const MCPhysReg *FRegs = NULL;
+    if (STI.hasFPUv2DoubleFloat() || STI.hasFPUv3DoubleFloat())
+      FRegs = CSF64Regs;
+    else if (STI.hasFPUv2SingleFloat() || STI.hasFPUv3SingleFloat())
+      FRegs = CSF32Regs;
+
+    if (FRegs != NULL) {
+      const MCPhysReg *Regs = MF.getRegInfo().getCalleeSavedRegs();
+
+      for (unsigned i = 0; Regs[i]; ++i)
+        if (CSKY::FPR32RegClass.contains(Regs[i]) ||
+            CSKY::FPR64RegClass.contains(Regs[i])) {
+          unsigned x = 0;
+          for (; FRegs[x]; ++x)
+            if (FRegs[x] == Regs[i])
+              break;
+          if (FRegs[x] == 0)
+            SavedRegs.set(Regs[i]);
+        }
+    }
+  }
+
+  CFI->setLRIsSpilled(SavedRegs.test(CSKY::R15));
+
+  unsigned CSStackSize = 0;
+  for (unsigned Reg : SavedRegs.set_bits()) {
+    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+    CSStackSize += RegSize;
+  }
+
+  CFI->setCalleeSaveAreaSize(CSStackSize);
+
+  uint64_t Limit = estimateRSStackSizeLimit(MF, STI);
+
+  bool BigFrame = (MFI.estimateStackSize(MF) + CSStackSize >= Limit);
+
+  if (BigFrame || CFI->isCRSpilled() || !STI.hasE2()) {
+    const TargetRegisterClass *RC = &CSKY::GPRRegClass;
+    unsigned size = TRI->getSpillSize(*RC);
+    Align align = TRI->getSpillAlign(*RC);
+
+    RS->addScavengingFrameIndex(MFI.CreateStackObject(size, align, false));
+  }
+}
+
+// Do not reserve stack space in the prologue for outgoing variables when the
+// function contains variable-sized objects; let eliminateCallFramePseudoInstr
+// reserve that space instead.
+bool CSKYFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+bool CSKYFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return true;
+
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+  if (MI != MBB.end() && !MI->isDebugInstr())
+    DL = MI->getDebugLoc();
+
+  for (auto &CS : CSI) {
+    // Insert the spill to the stack frame.
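+    // Each register spills to its own frame index; storeRegToStackSlot is
+    // expected to pick the store opcode from the register's class (GPR vs.
+    // FPR).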
+ Register Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI); + } + + return true; +} + +bool CSKYFrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return true; + + MachineFunction *MF = MBB.getParent(); + const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + DebugLoc DL; + if (MI != MBB.end() && !MI->isDebugInstr()) + DL = MI->getDebugLoc(); + + for (auto &CS : reverse(CSI)) { + Register Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); + assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); + } + + return true; +} + +// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions. +MachineBasicBlock::iterator CSKYFrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + Register SPReg = CSKY::R14; + DebugLoc DL = MI->getDebugLoc(); + + if (!hasReservedCallFrame(MF)) { + // If space has not been reserved for a call frame, ADJCALLSTACKDOWN and + // ADJCALLSTACKUP must be converted to instructions manipulating the stack + // pointer. This is necessary when there is a variable length stack + // allocation (e.g. alloca), which means it's not possible to allocate + // space for outgoing arguments from within the function prologue. + int64_t Amount = MI->getOperand(0).getImm(); + + if (Amount != 0) { + // Ensure the stack remains aligned after adjustment. + Amount = alignSPAdjust(Amount); + + if (MI->getOpcode() == CSKY::ADJCALLSTACKDOWN) + Amount = -Amount; + + adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags); + } + } + + return MBB.erase(MI); +} + +void CSKYFrameLowering::adjustReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DestReg, + Register SrcReg, int64_t Val, + MachineInstr::MIFlag Flag) const { + const CSKYInstrInfo *TII = STI.getInstrInfo(); + + if (DestReg == SrcReg && Val == 0) + return; + + // TODO: Add 16-bit instruction support with immediate num + if (STI.hasE2() && isUInt<12>(std::abs(Val) - 1)) { + BuildMI(MBB, MBBI, DL, TII->get(Val < 0 ? CSKY::SUBI32 : CSKY::ADDI32), + DestReg) + .addReg(SrcReg) + .addImm(std::abs(Val)) + .setMIFlag(Flag); + } else if (!STI.hasE2() && isShiftedUInt<7, 2>(std::abs(Val))) { + BuildMI(MBB, MBBI, DL, + TII->get(Val < 0 ? CSKY::SUBI16SPSP : CSKY::ADDI16SPSP), CSKY::R14) + .addReg(CSKY::R14, RegState::Kill) + .addImm(std::abs(Val)) + .setMIFlag(Flag); + } else { + + unsigned Op = 0; + + if (STI.hasE2()) { + Op = Val < 0 ? CSKY::SUBU32 : CSKY::ADDU32; + } else { + assert(SrcReg == DestReg); + Op = Val < 0 ? 
CSKY::SUBU16XZ : CSKY::ADDU16XZ; + } + + Register ScratchReg = TII->movImm(MBB, MBBI, DL, std::abs(Val), Flag); + + BuildMI(MBB, MBBI, DL, TII->get(Op), DestReg) + .addReg(SrcReg) + .addReg(ScratchReg, RegState::Kill) + .setMIFlag(Flag); + } +} + +StackOffset +CSKYFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const { + const CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const auto &CSI = MFI.getCalleeSavedInfo(); + + int MinCSFI = 0; + int MaxCSFI = -1; + + int Offset = MFI.getObjectOffset(FI) + MFI.getOffsetAdjustment(); + + if (CSI.size()) { + MinCSFI = CSI[0].getFrameIdx(); + MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + } + + if (FI >= MinCSFI && FI <= MaxCSFI) { + FrameReg = CSKY::R14; + Offset += CFI->getVarArgsSaveSize() + CFI->getCalleeSaveAreaSize(); + } else if (RI->hasStackRealignment(MF)) { + assert(hasFP(MF)); + if (!MFI.isFixedObjectIndex(FI)) { + FrameReg = hasBP(MF) ? getBPReg(STI) : CSKY::R14; + Offset += MFI.getStackSize(); + } else { + FrameReg = getFPReg(STI); + Offset += CFI->getVarArgsSaveSize() + CFI->getCalleeSaveAreaSize(); + } + } else { + if (MFI.isFixedObjectIndex(FI) && hasFP(MF)) { + FrameReg = getFPReg(STI); + Offset += CFI->getVarArgsSaveSize() + CFI->getCalleeSaveAreaSize(); + } else { + FrameReg = hasBP(MF) ? getBPReg(STI) : CSKY::R14; + Offset += MFI.getStackSize(); + } + } + + return StackOffset::getFixed(Offset); } diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.h b/llvm/lib/Target/CSKY/CSKYFrameLowering.h index 49921a1866bc..69bf01cf1801 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.h +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.h @@ -21,6 +21,11 @@ class CSKYSubtarget; class CSKYFrameLowering : public TargetFrameLowering { const CSKYSubtarget &STI; + void determineFrameLayout(MachineFunction &MF) const; + void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DestReg, Register SrcReg, + int64_t Val, MachineInstr::MIFlag Flag) const; + public: explicit CSKYFrameLowering(const CSKYSubtarget &STI) : TargetFrameLowering(StackGrowsDown, @@ -31,8 +36,39 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const override; + + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + + bool assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override { + + std::reverse(CSI.begin(), CSI.end()); + + return false; + } + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + ArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; + bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; + + bool hasReservedCallFrame(const MachineFunction &MF) const override; + + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) 
const override; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp index 8dc91904b8cc..d58f9095aa0d 100644 --- a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp @@ -68,6 +68,24 @@ void CSKYDAGToDAGISel::Select(SDNode *N) { case ISD::SUBCARRY: IsSelected = selectSubCarry(N); break; + case ISD::GLOBAL_OFFSET_TABLE: { + Register GP = Subtarget->getInstrInfo()->getGlobalBaseReg(*MF); + ReplaceNode(N, CurDAG->getRegister(GP, N->getValueType(0)).getNode()); + + IsSelected = true; + break; + } + case ISD::FrameIndex: { + SDValue Imm = CurDAG->getTargetConstant(0, Dl, MVT::i32); + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); + ReplaceNode(N, CurDAG->getMachineNode(Subtarget->hasE2() ? CSKY::ADDI32 + : CSKY::ADDI16XZ, + Dl, MVT::i32, TFI, Imm)); + + IsSelected = true; + break; + } } if (IsSelected) diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index a1f7cc685d4c..0b589e3d3e4f 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -13,6 +13,7 @@ #include "CSKYISelLowering.h" #include "CSKYCallingConv.h" +#include "CSKYConstantPoolValue.h" #include "CSKYMachineFunctionInfo.h" #include "CSKYRegisterInfo.h" #include "CSKYSubtarget.h" @@ -37,6 +38,18 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, // Register Class addRegisterClass(MVT::i32, &CSKY::GPRRegClass); + if (STI.useHardFloat()) { + if (STI.hasFPUv2SingleFloat()) + addRegisterClass(MVT::f32, &CSKY::sFPR32RegClass); + else if (STI.hasFPUv3SingleFloat()) + addRegisterClass(MVT::f32, &CSKY::FPR32RegClass); + + if (STI.hasFPUv2DoubleFloat()) + addRegisterClass(MVT::f64, &CSKY::sFPR64RegClass); + else if (STI.hasFPUv3DoubleFloat()) + addRegisterClass(MVT::f64, &CSKY::FPR64RegClass); + } + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); @@ -53,16 +66,29 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i1, Promote); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); + setOperationAction(ISD::VASTART, MVT::Other, Custom); + if (!Subtarget.hasE2()) { 
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Expand); @@ -77,6 +103,44 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i32, Expand); } + if (!Subtarget.has3r2E3r3()) { + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); + } + + // Float + + ISD::CondCode FPCCToExtend[] = { + ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, + ISD::SETUGE, ISD::SETULT, ISD::SETULE, + }; + + ISD::NodeType FPOpToExpand[] = {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, + ISD::FPOW, ISD::FREM, ISD::FCOPYSIGN}; + + if (STI.useHardFloat()) { + + MVT AllVTy[] = {MVT::f32, MVT::f64}; + + for (auto VT : AllVTy) { + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::BR_CC, VT, Expand); + + for (auto CC : FPCCToExtend) + setCondCodeAction(CC, VT, Expand); + for (auto Op : FPOpToExpand) + setOperationAction(Op, VT, Expand); + } + + if (STI.hasFPUv2SingleFloat() || STI.hasFPUv3SingleFloat()) { + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + } + if (STI.hasFPUv2DoubleFloat() || STI.hasFPUv3DoubleFloat()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + } + } + // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); @@ -92,6 +156,30 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::Source); } +SDValue CSKYTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented op"); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + case ISD::FRAMEADDR: + return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); + } +} + EVT CSKYTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const { if (!VT.isVector()) @@ -145,6 +233,14 @@ static SDValue unpackFromRegLoc(const CSKYSubtarget &Subtarget, case MVT::i32: RC = &CSKY::GPRRegClass; break; + case MVT::f32: + RC = Subtarget.hasFPUv2SingleFloat() ? &CSKY::sFPR32RegClass + : &CSKY::FPR32RegClass; + break; + case MVT::f64: + RC = Subtarget.hasFPUv2DoubleFloat() ? &CSKY::sFPR64RegClass + : &CSKY::FPR64RegClass; + break; } Register VReg = RegInfo.createVirtualRegister(RC); @@ -181,6 +277,44 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, return Val; } +static SDValue unpack64(SelectionDAG &DAG, SDValue Chain, const CCValAssign &VA, + const SDLoc &DL) { + assert(VA.getLocVT() == MVT::i32 && + (VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::i64) && + "Unexpected VA"); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + if (VA.isMemLoc()) { + // f64/i64 is passed on the stack. 
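// Illustrative sketch (not part of the imported patch): unpack64 above and
// the BITCAST_{TO,FROM}_LOHI nodes treat an f64 as a pair of i32 halves. A
// standalone C++ model of that pairing, assuming IEEE-754 doubles and taking
// "lo" to mean the low 32 bits of the raw representation.
#include <cstdint>
#include <cstring>

static void splitF64(double D, uint32_t &Lo, uint32_t &Hi) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits)); // bit-exact copy, no aliasing UB
  Lo = static_cast<uint32_t>(Bits);
  Hi = static_cast<uint32_t>(Bits >> 32);
}

static double joinF64(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = (static_cast<uint64_t>(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D; // joinF64 after splitF64 is the identity, as BITCAST_TO_LOHI
}           // followed by BITCAST_FROM_LOHI is in the lowering above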
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true); + SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + return DAG.getLoad(VA.getValVT(), DL, Chain, FIN, + MachinePointerInfo::getFixedStack(MF, FI)); + } + + assert(VA.isRegLoc() && "Expected register VA assignment"); + + Register LoVReg = RegInfo.createVirtualRegister(&CSKY::GPRRegClass); + RegInfo.addLiveIn(VA.getLocReg(), LoVReg); + SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); + SDValue Hi; + if (VA.getLocReg() == CSKY::R3) { + // Second half of f64/i64 is passed on the stack. + int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true); + SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(MF, FI)); + } else { + // Second half of f64/i64 is passed in another GPR. + Register HiVReg = RegInfo.createVirtualRegister(&CSKY::GPRRegClass); + RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg); + Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); + } + return DAG.getNode(CSKYISD::BITCAST_FROM_LOHI, DL, VA.getValVT(), Lo, Hi); +} + // Transform physical registers into virtual registers. SDValue CSKYTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, @@ -210,7 +344,11 @@ SDValue CSKYTargetLowering::LowerFormalArguments( CCValAssign &VA = ArgLocs[i]; SDValue ArgValue; - if (VA.isRegLoc()) + bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64; + + if (IsF64OnCSKY) + ArgValue = unpack64(DAG, Chain, VA, DL); + else if (VA.isRegLoc()) ArgValue = unpackFromRegLoc(Subtarget, DAG, Chain, VA, DL); else ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL); @@ -354,6 +492,255 @@ CSKYTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(CSKYISD::RET, DL, MVT::Other, RetOps); } +// Lower a call to a callseq_start + CALL + callseq_end chain, and add input +// and output parameter nodes. +SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT XLenVT = MVT::i32; + + MachineFunction &MF = DAG.getMachineFunction(); + + // Analyze the operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + ArgCCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, IsVarArg)); + + // Check if it's really possible to do a tail call. + if (IsTailCall) + IsTailCall = false; // TODO: TailCallOptimization; + + if (IsTailCall) + ++NumTailCalls; + else if (CLI.CB && CLI.CB->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + // Get a count of how many bytes are to be pushed on the stack. 
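// Illustrative sketch (not part of the imported patch): the byte count read
// just below comes from assigning each stack-passed argument an aligned
// offset and advancing past it. A standalone model of that accounting; the
// alignment rule here is an assumed stand-in, not the real CSKY CC tables.
#include <cstdint>
#include <vector>

struct ArgSlot { uint64_t Offset = 0, Size = 0; };

static uint64_t roundUpTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

static uint64_t assignStackArgs(std::vector<ArgSlot> &Args) {
  uint64_t NextOffset = 0;
  for (ArgSlot &S : Args) {
    uint64_t Align = S.Size >= 8 ? 8 : 4; // assumed alignment rule
    NextOffset = roundUpTo(NextOffset, Align);
    S.Offset = NextOffset;                // cf. getLocMemOffset()
    NextOffset += S.Size;
  }
  return NextOffset;                      // cf. getNextStackOffset()
}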
+ unsigned NumBytes = ArgCCInfo.getNextStackOffset(); + + // Create local copies for byval args + SmallVector<SDValue, 8> ByValArgs; + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (!Flags.isByVal()) + continue; + + SDValue Arg = OutVals[i]; + unsigned Size = Flags.getByValSize(); + Align Alignment = Flags.getNonZeroByValAlign(); + + int FI = + MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false); + SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT); + + Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, + /*IsVolatile=*/false, + /*AlwaysInline=*/false, IsTailCall, + MachinePointerInfo(), MachinePointerInfo()); + ByValArgs.push_back(FIPtr); + } + + if (!IsTailCall) + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); + + // Copy argument values to their designated locations. + SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + SDValue StackPtr; + for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue ArgValue = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64; + + if (IsF64OnCSKY && VA.isRegLoc()) { + SDValue Split64 = + DAG.getNode(CSKYISD::BITCAST_TO_LOHI, DL, + DAG.getVTList(MVT::i32, MVT::i32), ArgValue); + SDValue Lo = Split64.getValue(0); + SDValue Hi = Split64.getValue(1); + + Register RegLo = VA.getLocReg(); + RegsToPass.push_back(std::make_pair(RegLo, Lo)); + + if (RegLo == CSKY::R3) { + // Second half of f64/i64 is passed on the stack. + // Work out the address of the stack slot. + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, DL, CSKY::R14, PtrVT); + // Emit the store. + MemOpChains.push_back( + DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo())); + } else { + // Second half of f64/i64 is passed in another GPR. + assert(RegLo < CSKY::R31 && "Invalid register pair"); + Register RegHigh = RegLo + 1; + RegsToPass.push_back(std::make_pair(RegHigh, Hi)); + } + continue; + } + + ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL); + + // Use local copy if it is a byval arg. + if (Flags.isByVal()) + ArgValue = ByValArgs[j++]; + + if (VA.isRegLoc()) { + // Queue up the argument copies and emit them at the end. + RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); + } else { + assert(VA.isMemLoc() && "Argument not register or memory"); + assert(!IsTailCall && "Tail call not allowed if stack is used " + "for passing parameters"); + + // Work out the address of the stack slot. + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, DL, CSKY::R14, PtrVT); + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, + DAG.getIntPtrConstant(VA.getLocMemOffset(), DL)); + + // Emit the store. + MemOpChains.push_back( + DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo())); + } + } + + // Join the stores, which are independent of one another. + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + + SDValue Glue; + + // Build a sequence of copy-to-reg nodes, chained and glued together. 
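// Illustrative sketch (not part of the imported patch), condensing the
// RegLo == R3 case above: the low half of an f64/i64 takes the next free
// GPR; if that was R3, the last argument register, the high half goes to
// the outgoing-argument stack area, otherwise it takes the adjacent GPR.
// The enum models CSKY::R0..R3; the copy loop resumes just below.
enum ArgReg { AR0, AR1, AR2, AR3, ARNone };

struct Split64Loc {
  ArgReg Lo = ARNone, Hi = ARNone;
  bool HiOnStack = false;
};

static Split64Loc assign64(ArgReg NextFree) {
  Split64Loc L;
  L.Lo = NextFree;
  if (NextFree == AR3)
    L.HiOnStack = true;                       // second half spills to stack
  else
    L.Hi = static_cast<ArgReg>(NextFree + 1); // consecutive GPR pair
  return L;
}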
+ for (auto &Reg : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue); + Glue = Chain.getValue(1); + } + + SmallVector<SDValue, 8> Ops; + EVT Ty = getPointerTy(DAG.getDataLayout()); + bool IsRegCall = false; + + Ops.push_back(Chain); + + if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = S->getGlobal(); + bool IsLocal = + getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); + + if (isPositionIndependent() || !Subtarget.has2E3()) { + IsRegCall = true; + Ops.push_back(getAddr<GlobalAddressSDNode, true>(S, DAG, IsLocal)); + } else { + Ops.push_back(getTargetNode(cast<GlobalAddressSDNode>(Callee), DL, Ty, + DAG, CSKYII::MO_None)); + Ops.push_back(getTargetConstantPoolValue( + cast<GlobalAddressSDNode>(Callee), Ty, DAG, CSKYII::MO_None)); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + bool IsLocal = getTargetMachine().shouldAssumeDSOLocal( + *MF.getFunction().getParent(), nullptr); + + if (isPositionIndependent() || !Subtarget.has2E3()) { + IsRegCall = true; + Ops.push_back(getAddr<ExternalSymbolSDNode, true>(S, DAG, IsLocal)); + } else { + Ops.push_back(getTargetNode(cast<ExternalSymbolSDNode>(Callee), DL, Ty, + DAG, CSKYII::MO_None)); + Ops.push_back(getTargetConstantPoolValue( + cast<ExternalSymbolSDNode>(Callee), Ty, DAG, CSKYII::MO_None)); + } + } else { + IsRegCall = true; + Ops.push_back(Callee); + } + + // Add argument registers to the end of the list so that they are + // known live into the call. + for (auto &Reg : RegsToPass) + Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + + if (!IsTailCall) { + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + } + + // Glue the call to the argument copies, if any. + if (Glue.getNode()) + Ops.push_back(Glue); + + // Emit the call. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + if (IsTailCall) { + MF.getFrameInfo().setHasTailCall(); + return DAG.getNode(IsRegCall ? CSKYISD::TAILReg : CSKYISD::TAIL, DL, + NodeTys, Ops); + } + + Chain = DAG.getNode(IsRegCall ? CSKYISD::CALLReg : CSKYISD::CALL, DL, NodeTys, + Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); + Glue = Chain.getValue(1); + + // Mark the end of the call, which is glued to the call itself. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, DL, PtrVT, true), + DAG.getConstant(0, DL, PtrVT, true), Glue, DL); + Glue = Chain.getValue(1); + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> CSKYLocs; + CCState RetCCInfo(CallConv, IsVarArg, MF, CSKYLocs, *DAG.getContext()); + RetCCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, IsVarArg)); + + // Copy all of the result registers out of their specified physreg. 
+ for (auto &VA : CSKYLocs) { + // Copy the value out + SDValue RetValue = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); + // Glue the RetValue to the end of the call sequence + Chain = RetValue.getValue(1); + Glue = RetValue.getValue(2); + + bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64; + + if (IsF64OnCSKY) { + assert(VA.getLocReg() == GPRArgRegs[0] && "Unexpected reg assignment"); + SDValue RetValue2 = + DAG.getCopyFromReg(Chain, DL, GPRArgRegs[1], MVT::i32, Glue); + Chain = RetValue2.getValue(1); + Glue = RetValue2.getValue(2); + RetValue = DAG.getNode(CSKYISD::BITCAST_FROM_LOHI, DL, VA.getValVT(), + RetValue, RetValue2); + } + + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); + + InVals.push_back(RetValue); + } + + return Chain; +} + CCAssignFn *CSKYTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg) const { if (IsVarArg || !Subtarget.useHardFloatABI()) @@ -370,6 +757,165 @@ CCAssignFn *CSKYTargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_CSKY_ABIV2_FP; } +static CSKYCP::CSKYCPModifier getModifier(unsigned Flags) { + + if (Flags == CSKYII::MO_ADDR32) + return CSKYCP::ADDR; + else if (Flags == CSKYII::MO_GOT32) + return CSKYCP::GOT; + else if (Flags == CSKYII::MO_GOTOFF) + return CSKYCP::GOTOFF; + else if (Flags == CSKYII::MO_PLT32) + return CSKYCP::PLT; + else if (Flags == CSKYII::MO_None) + return CSKYCP::NO_MOD; + else + assert(0 && "unknown CSKYII Modifier"); + return CSKYCP::NO_MOD; +} + +SDValue CSKYTargetLowering::getTargetConstantPoolValue(GlobalAddressSDNode *N, + EVT Ty, + SelectionDAG &DAG, + unsigned Flags) const { + CSKYConstantPoolValue *CPV = CSKYConstantPoolConstant::Create( + N->getGlobal(), CSKYCP::CPValue, 0, getModifier(Flags), false); + + return DAG.getTargetConstantPool(CPV, Ty); +} + +static MachineBasicBlock * +emitSelectPseudo(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) { + + const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + // To "insert" a SELECT instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + // thisMBB: + // ... + // TrueVal = ... + // bt32 c, sinkMBB + // fallthrough --> copyMBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copyMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copyMBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copyMBB); + BB->addSuccessor(sinkMBB); + + // bt32 condition, sinkMBB + BuildMI(BB, DL, TII.get(Opcode)) + .addReg(MI.getOperand(1).getReg()) + .addMBB(sinkMBB); + + // copyMBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copyMBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copyMBB ] + // ... 
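// Illustrative sketch (not part of the imported patch): the diamond that
// emitSelectPseudo builds, written as straight-line C++. thisMBB assumes the
// true value and conditionally branches over copyMBB, which computes the
// false value; sinkMBB's PHI (built just below) is the merge at the join.
static int selectViaDiamond(bool Cond, int TrueVal, int FalseVal) {
  int Result = TrueVal; // thisMBB: TrueVal = ...
  if (Cond)
    goto sink;          // bt32 c, sinkMBB
  Result = FalseVal;    // copyMBB: %FalseValue = ..., fall through
sink:
  return Result;        // sinkMBB: %Result = phi [TrueVal], [FalseVal]
}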
+ BB = sinkMBB; + + BuildMI(*BB, BB->begin(), DL, TII.get(CSKY::PHI), MI.getOperand(0).getReg()) + .addReg(MI.getOperand(2).getReg()) + .addMBB(thisMBB) + .addReg(MI.getOperand(3).getReg()) + .addMBB(copyMBB); + + MI.eraseFromParent(); // The pseudo instruction is gone now. + + return BB; +} + +MachineBasicBlock * +CSKYTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instr type to insert"); + case CSKY::ISEL32: + return emitSelectPseudo(MI, BB, CSKY::BT32); + case CSKY::ISEL16: + return emitSelectPseudo(MI, BB, CSKY::BT16); + } +} + +SDValue CSKYTargetLowering::getTargetConstantPoolValue(ExternalSymbolSDNode *N, + EVT Ty, + SelectionDAG &DAG, + unsigned Flags) const { + CSKYConstantPoolValue *CPV = + CSKYConstantPoolSymbol::Create(Type::getInt32Ty(*DAG.getContext()), + N->getSymbol(), 0, getModifier(Flags)); + + return DAG.getTargetConstantPool(CPV, Ty); +} + +SDValue CSKYTargetLowering::getTargetConstantPoolValue(JumpTableSDNode *N, + EVT Ty, + SelectionDAG &DAG, + unsigned Flags) const { + CSKYConstantPoolValue *CPV = + CSKYConstantPoolJT::Create(Type::getInt32Ty(*DAG.getContext()), + N->getIndex(), 0, getModifier(Flags)); + return DAG.getTargetConstantPool(CPV, Ty); +} + +SDValue CSKYTargetLowering::getTargetConstantPoolValue(BlockAddressSDNode *N, + EVT Ty, + SelectionDAG &DAG, + unsigned Flags) const { + CSKYConstantPoolValue *CPV = CSKYConstantPoolConstant::Create( + N->getBlockAddress(), CSKYCP::CPBlockAddress, 0, getModifier(Flags), + false); + return DAG.getTargetConstantPool(CPV, Ty); +} + +SDValue CSKYTargetLowering::getTargetNode(GlobalAddressSDNode *N, SDLoc DL, + EVT Ty, SelectionDAG &DAG, + unsigned Flags) const { + return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags); +} + +SDValue CSKYTargetLowering::getTargetNode(ExternalSymbolSDNode *N, SDLoc DL, + EVT Ty, SelectionDAG &DAG, + unsigned Flags) const { + return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flags); +} + +SDValue CSKYTargetLowering::getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty, + SelectionDAG &DAG, + unsigned Flags) const { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flags); +} + +SDValue CSKYTargetLowering::getTargetNode(BlockAddressSDNode *N, SDLoc DL, + EVT Ty, SelectionDAG &DAG, + unsigned Flags) const { + return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(), + Flags); +} + const char *CSKYTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: @@ -380,7 +926,243 @@ const char *CSKYTargetLowering::getTargetNodeName(unsigned Opcode) const { return "CSKYISD::NIR"; case CSKYISD::RET: return "CSKYISD::RET"; + case CSKYISD::CALL: + return "CSKYISD::CALL"; + case CSKYISD::CALLReg: + return "CSKYISD::CALLReg"; + case CSKYISD::TAIL: + return "CSKYISD::TAIL"; + case CSKYISD::TAILReg: + return "CSKYISD::TAILReg"; + case CSKYISD::LOAD_ADDR: + return "CSKYISD::LOAD_ADDR"; case CSKYISD::BITCAST_TO_LOHI: return "CSKYISD::BITCAST_TO_LOHI"; + case CSKYISD::BITCAST_FROM_LOHI: + return "CSKYISD::BITCAST_FROM_LOHI"; } } + +SDValue CSKYTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT Ty = Op.getValueType(); + GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); + int64_t Offset = N->getOffset(); + + const GlobalValue *GV = N->getGlobal(); + bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); + SDValue Addr = getAddr<GlobalAddressSDNode, false>(N, DAG, 
IsLocal); + + // In order to maximise the opportunity for common subexpression elimination, + // emit a separate ADD node for the global address offset instead of folding + // it in the global address node. Later peephole optimisations may choose to + // fold it back in when profitable. + if (Offset != 0) + return DAG.getNode(ISD::ADD, DL, Ty, Addr, + DAG.getConstant(Offset, DL, MVT::i32)); + return Addr; +} + +SDValue CSKYTargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + ExternalSymbolSDNode *N = cast<ExternalSymbolSDNode>(Op); + + return getAddr(N, DAG, false); +} + +SDValue CSKYTargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + JumpTableSDNode *N = cast<JumpTableSDNode>(Op); + + return getAddr<JumpTableSDNode, false>(N, DAG); +} + +SDValue CSKYTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op); + + return getAddr(N, DAG); +} + +SDValue CSKYTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + CSKYMachineFunctionInfo *FuncInfo = MF.getInfo<CSKYMachineFunctionInfo>(); + + SDLoc DL(Op); + SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + getPointerTy(MF.getDataLayout())); + + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1), + MachinePointerInfo(SV)); +} + +SDValue CSKYTargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + const CSKYRegisterInfo &RI = *Subtarget.getRegisterInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MFI.setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + Register FrameReg = RI.getFrameRegister(MF); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo()); + return FrameAddr; +} + +SDValue CSKYTargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + const CSKYRegisterInfo &RI = *Subtarget.getRegisterInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MFI.setReturnAddressIsTaken(true); + + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(4, dl, MVT::i32); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + MachinePointerInfo()); + } + // Return the value of the return address register, marking it an implicit + // live-in. 
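// Illustrative sketch (not part of the imported patch): the Depth loop in
// LowerFRAMEADDR above walks the saved-FP chain, one load per frame. This
// model assumes the caller's FP is stored at offset 0 of the current frame
// and the return address 4 bytes past it, matching the RETURNADDR lowering.
static const char *frameAddress(const char *FP, unsigned Depth) {
  while (Depth--)
    FP = *reinterpret_cast<const char *const *>(FP); // load caller's FP
  return FP;
}
// returnaddress(Depth > 0) is then a load 4 bytes past that frame pointer:
//   *reinterpret_cast<const char *const *>(frameAddress(FP, Depth) + 4)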
+ unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(MVT::i32)); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); +} + +Register CSKYTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + return CSKY::R0; +} + +Register CSKYTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + return CSKY::R1; +} + +SDValue CSKYTargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT Ty = Op.getValueType(); + GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); + int64_t Offset = N->getOffset(); + MVT XLenVT = MVT::i32; + + TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal()); + SDValue Addr; + switch (Model) { + case TLSModel::LocalExec: + Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false); + break; + case TLSModel::InitialExec: + Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true); + break; + case TLSModel::LocalDynamic: + case TLSModel::GeneralDynamic: + Addr = getDynamicTLSAddr(N, DAG); + break; + } + + // In order to maximise the opportunity for common subexpression elimination, + // emit a separate ADD node for the global address offset instead of folding + // it in the global address node. Later peephole optimisations may choose to + // fold it back in when profitable. + if (Offset != 0) + return DAG.getNode(ISD::ADD, DL, Ty, Addr, + DAG.getConstant(Offset, DL, XLenVT)); + return Addr; +} + +SDValue CSKYTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, + SelectionDAG &DAG, + bool UseGOT) const { + MachineFunction &MF = DAG.getMachineFunction(); + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + + unsigned CSKYPCLabelIndex = CFI->createPICLabelUId(); + + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + + CSKYCP::CSKYCPModifier Flag = UseGOT ? CSKYCP::TLSIE : CSKYCP::TLSLE; + bool AddCurrentAddr = UseGOT ? true : false; + unsigned char PCAjust = UseGOT ? 4 : 0; + + CSKYConstantPoolValue *CPV = + CSKYConstantPoolConstant::Create(N->getGlobal(), CSKYCP::CPValue, PCAjust, + Flag, AddCurrentAddr, CSKYPCLabelIndex); + SDValue CAddr = DAG.getTargetConstantPool(CPV, Ty); + + SDValue Load; + if (UseGOT) { + SDValue PICLabel = DAG.getTargetConstant(CSKYPCLabelIndex, DL, MVT::i32); + auto *LRWGRS = DAG.getMachineNode(CSKY::PseudoTLSLA32, DL, {Ty, Ty}, + {CAddr, PICLabel}); + auto LRWADDGRS = + DAG.getNode(ISD::ADD, DL, Ty, SDValue(LRWGRS, 0), SDValue(LRWGRS, 1)); + Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), LRWADDGRS, + MachinePointerInfo(N->getGlobal())); + } else { + Load = SDValue(DAG.getMachineNode(CSKY::LRW32, DL, Ty, CAddr), 0); + } + + // Add the thread pointer. 
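// Illustrative sketch (not part of the imported patch): the address
// arithmetic behind the two static TLS models above, both of which finish
// with the thread-pointer (R31) add emitted just below. Local-exec bakes a
// constant offset into the constant pool; initial-exec first loads that
// offset from the GOT. A plain-integer model.
#include <cstdint>

static uintptr_t tlsLocalExec(uintptr_t TP, uintptr_t ConstOffset) {
  return TP + ConstOffset; // lrw32 offset; add32 tp
}

static uintptr_t tlsInitialExec(uintptr_t TP, const uintptr_t *GOTEntry) {
  return TP + *GOTEntry;   // load offset from GOT entry; add32 tp
}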
+ SDValue TPReg = DAG.getRegister(CSKY::R31, MVT::i32); + return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg); +} + +SDValue CSKYTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + + unsigned CSKYPCLabelIndex = CFI->createPICLabelUId(); + + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits()); + + CSKYConstantPoolValue *CPV = + CSKYConstantPoolConstant::Create(N->getGlobal(), CSKYCP::CPValue, 4, + CSKYCP::TLSGD, true, CSKYPCLabelIndex); + SDValue Addr = DAG.getTargetConstantPool(CPV, Ty); + SDValue PICLabel = DAG.getTargetConstant(CSKYPCLabelIndex, DL, MVT::i32); + + auto *LRWGRS = + DAG.getMachineNode(CSKY::PseudoTLSLA32, DL, {Ty, Ty}, {Addr, PICLabel}); + + auto Load = + DAG.getNode(ISD::ADD, DL, Ty, SDValue(LRWGRS, 0), SDValue(LRWGRS, 1)); + + // Prepare argument list to generate call. + ArgListTy Args; + ArgListEntry Entry; + Entry.Node = Load; + Entry.Ty = CallTy; + Args.push_back(Entry); + + // Setup call to __tls_get_addr. + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, CallTy, + DAG.getExternalSymbol("__tls_get_addr", Ty), + std::move(Args)); + SDValue V = LowerCallTo(CLI).first; + + return V; +} diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.h b/llvm/lib/Target/CSKY/CSKYISelLowering.h index 7557c11f50a8..e1744d5ce220 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.h +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.h @@ -27,7 +27,15 @@ enum NodeType : unsigned { NIE, NIR, RET, - BITCAST_TO_LOHI + CALL, + CALLReg, + TAIL, + TAILReg, + LOAD_ADDR, + // i32, i32 <-- f64 + BITCAST_TO_LOHI, + // f64 < -- i32, i32 + BITCAST_FROM_LOHI, }; } @@ -38,6 +46,8 @@ public: explicit CSKYTargetLowering(const TargetMachine &TM, const CSKYSubtarget &STI); + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -58,8 +68,96 @@ private: const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + Register + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + Register + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + + bool isSelectSupported(SelectSupportKind Kind) const override { + // CSKY does not support scalar condition selects on vectors. 
+ return (Kind != ScalarCondVectorVal); + } + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; + + SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + SDValue getTargetNode(ExternalSymbolSDNode *N, SDLoc DL, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty, SelectionDAG &DAG, + unsigned Flags) const; + + SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + SDValue getTargetConstantPoolValue(GlobalAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + SDValue getTargetConstantPoolValue(ExternalSymbolSDNode *N, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + SDValue getTargetConstantPoolValue(JumpTableSDNode *N, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + SDValue getTargetConstantPoolValue(BlockAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, unsigned Flags) const; + + template <class NodeTy, bool IsCall = false> + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const { + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + + unsigned Flag = CSKYII::MO_None; + bool IsPIC = isPositionIndependent(); + + if (IsPIC) + Flag = IsLocal ? CSKYII::MO_GOTOFF + : IsCall ? CSKYII::MO_PLT32 + : CSKYII::MO_GOT32; + + SDValue TCPV = getTargetConstantPoolValue(N, Ty, DAG, Flag); + SDValue TV = getTargetNode(N, DL, Ty, DAG, Flag); + SDValue Addr = DAG.getNode(CSKYISD::LOAD_ADDR, DL, Ty, {TV, TCPV}); + + if (!IsPIC) + return Addr; + + SDValue Result = + DAG.getNode(ISD::ADD, DL, Ty, {DAG.getGLOBAL_OFFSET_TABLE(Ty), Addr}); + if (IsLocal) + return Result; + + return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction())); + } + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + + SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, + bool UseGOT) const; + SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg) const; }; diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td new file mode 100644 index 000000000000..446670a4d0a9 --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td @@ -0,0 +1,274 @@ +//===- CSKYInstrFormatsF1.td - CSKY Float1.0 Instr Format --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// CSKY Instruction Format Float1.0 Definitions. 
+// +//===----------------------------------------------------------------------===// + +class CSKYFP1Inst<dag outs, dag ins, string asmstr, list<dag> pattern> + : CSKY32Inst<AddrModeNone, 0x3d, outs, ins, asmstr, pattern>, Requires<[HasFPUv2_SF]> { +} + +class F_XYZ_BASE<bits<5> datatype, bits<6> sop, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKYFP1Inst<outs, ins, opcodestr, pattern> { + bits<4> vrx; + bits<4> vry; + bits<4> vrz; + let Inst{25 - 21} = {0, vry}; + let Inst{20 - 16} = {0, vrx}; + let Inst{15 - 11} = datatype; + let Inst{10 - 5} = sop; + let Inst{4 - 0} = {0, vrz}; +} + +class F_XZ_GF<bits<5> datatype, bits<6> sop, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKYFP1Inst<outs, ins, opcodestr, pattern> { + bits<4> vrx; + bits<5> rz; + let Inst{25 - 21} = 0; + let Inst{20 - 16} = {0, vrx}; + let Inst{15 - 11} = datatype; + let Inst{10 - 5} = sop; + let Inst{4 - 0} = {rz}; +} + +class F_XZ_FG<bits<5> datatype, bits<6> sop, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKYFP1Inst<outs, ins, opcodestr, pattern> { + bits<5> rx; + bits<4> vrz; + let Inst{25 - 21} = 0; + let Inst{20 - 16} = {rx}; + let Inst{15 - 11} = datatype; + let Inst{10 - 5} = sop; + let Inst{4 - 0} = {0, vrz}; +} + +class F_XZ_TRANS_FROM<bits<6> sop, string op, RegisterOperand regtype1, RegisterOperand regtype2> + : F_XZ_GF<3, sop, (outs regtype1:$rz), (ins regtype2:$vrx), !strconcat(op, "\t$rz, $vrx"), + []>; + +class F_XZ_TRANS_TO<bits<6> sop, string op, RegisterOperand regtype1, RegisterOperand regtype2> + : F_XZ_FG<3, sop, (outs regtype1:$vrz), (ins regtype2:$rx), !strconcat(op, "\t$vrz, $rx"), + []>; + +let vry = 0 in { +class F_XZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrz, $vrx"), + [(set regtype:$vrz, (opnode regtype:$vrx))]>; + +class F_MOV<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrz, $vrx"), + []>; + +class F_XZ_TRANS<bits<6> sop, string op, RegisterOperand regtype1, RegisterOperand regtype2> + : F_XYZ_BASE<3, sop, (outs regtype1:$vrz), (ins regtype2:$vrx), !strconcat(op, "\t$vrz, $vrx"), + []>; + +class F_XZ_TRANS_DS<bits<6> sop, string op, PatFrag opnode> + : F_XYZ_BASE<3, sop, (outs sFPR32Op:$vrz), (ins sFPR64Op:$vrx), !strconcat(op, "\t$vrz, $vrx"), + [(set sFPR32Op:$vrz, (opnode sFPR64Op:$vrx))]>; + +class F_XZ_TRANS_SD<bits<6> sop, string op, PatFrag opnode> + : F_XYZ_BASE<3, sop, (outs sFPR64Op:$vrz), (ins sFPR32Op:$vrx), !strconcat(op, "\t$vrz, $vrx"), + [(set sFPR64Op:$vrz, (opnode sFPR32Op:$vrx))]>; +} + +multiclass FT_MOV<bits<6> sop, string op> { + def _S : F_MOV<0, sop, op, "s", sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_MOV<1, sop, op, "d", sFPR64Op>; +} + +multiclass FT_XZ<bits<6> sop, string op, PatFrag opnode> { + def _S : F_XZ<0, sop, op, "s", opnode, sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_XZ<1, sop, op, "d", opnode, sFPR64Op>; +} + +let vrz = 0, isCompare = 1 in { +class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), !strconcat(op#op_su, "\t$vrx, $vry"), + []>; + +let vry = 0 in{ +class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype> + : 
F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrx"), + []>; +} +} + +class F_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrx, regtype:$vry), + !strconcat(op#op_su, "\t$vrz, $vrx, $vry"), + [(set regtype:$vrz, (opnode regtype:$vrx, regtype:$vry))]>; + +multiclass FT_XYZ<bits<6> sop, string op, PatFrag opnode> { + def _S : F_XYZ<0, sop, op, "s", opnode, sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_XYZ<1, sop, op, "d", opnode, sFPR64Op>; +} + +let Constraints = "$vrt = $vrz" in { +class F_ACCUM_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype> + : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrt, regtype:$vrx, regtype:$vry), + !strconcat(op#op_su, "\t$vrz, $vrx, $vry"), + [(set regtype:$vrz, (opnode regtype:$vrt, regtype:$vrx, regtype:$vry))]>; +} + +multiclass FT_ACCUM_XYZ<bits<6> sop, string op, PatFrag opnode> { + def _S : F_ACCUM_XYZ<0, sop, op, "s", opnode, sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_ACCUM_XYZ<1, sop, op, "d", opnode, sFPR64Op>; +} + +multiclass FT_CMPXY<bits<6> sop, string op> { + def _S : F_CMPXY<0, sop, op, "s", sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_CMPXY<1, sop, op, "d", sFPR64Op>; +} + + +multiclass FT_CMPZX<bits<6> sop, string op> { + def _S : F_CMPZX<0, sop, op, "s", sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_CMPZX<1, sop, op, "d", sFPR64Op>; +} + +class F_I8_XY_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKY32Inst<AddrMode32SDF, 0x3d, outs, ins, opcodestr, pattern> { + bits<5> rx; + bits<4> vrz; + bits<8> imm8; + let Inst{25} = 0; + let Inst{24 - 21} = imm8{7 - 4}; //imm4h + let Inst{20 - 16} = rx; //rx + let Inst{15 - 9} = sop; + let Inst{8} = sop_su; + let Inst{7 - 4} = imm8{3 - 0}; // imm4l + let Inst{3 - 0} = vrz; +} + +class F_I4_XY_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKY32Inst<AddrMode32SDF, 0x3d, outs, ins, opcodestr, pattern> { + bits<10> regs; + bits<5> rx; + + let Inst{25} = 0; + let Inst{24 - 21} = regs{3-0}; //imm4 + let Inst{20 - 16} = rx; //rx + let Inst{15 - 9} = sop; + let Inst{8} = sop_su; + let Inst{7 - 4} = 0; + let Inst{3 - 0} = regs{8-5}; +} + +class F_I8_Z_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKY32Inst<AddrModeNone, 0x3d, outs, ins, opcodestr, pattern> { + bits<4> vrz; + bits<8> imm8; + let Inst{25} = 0; + let Inst{24 - 21} = imm8{7 - 4}; //imm4h + let Inst{20 - 16} = 0; //rx + let Inst{15 - 9} = sop; + let Inst{8} = sop_su; + let Inst{7 - 4} = imm8{3 - 0}; // imm4l + let Inst{3 - 0} = vrz; +} + +class F_XYZ_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern> + : CSKY32Inst<AddrModeNone, 0x3d, outs, ins, opcodestr, pattern> { + bits<5> rx; + bits<5> ry; + bits<4> vrz; + bits<2> imm; + + let Inst{25 - 21} = ry; // ry; + let Inst{20 - 16} = rx; // rx; + let Inst{15 - 9} = sop; + let Inst{8} = sop_su; + let Inst{7} = 0; + let Inst{6,5} = imm; // shift; + let Inst{4} = 0; + let Inst{3 - 0} = vrz; +} + +class F_XYAI_LD<bits<7> sop, bits<1> sop_su, string op, string op_su, + RegisterOperand regtype, Operand operand> + : F_I8_XY_MEM<sop, sop_su, (outs regtype:$vrz), (ins GPR:$rx, operand:$imm8), + !strconcat(op#op_su, "\t$vrz, ($rx, ${imm8})"), []>; + 
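// Illustrative C++ sketch (not part of the imported patch): how the split
// imm8 field of F_I8_XY_MEM above lands in the instruction word. Bits 24-21
// take imm8{7-4} (imm4h) and bits 7-4 take imm8{3-0} (imm4l); the remaining
// fields are omitted here for brevity.
#include <cstdint>

static uint32_t encodeSplitImm8(uint32_t Inst, uint8_t Imm8) {
  Inst &= ~((0xFu << 21) | (0xFu << 4));                  // clear imm4h/imm4l
  Inst |= (static_cast<uint32_t>(Imm8 >> 4) & 0xF) << 21; // imm4h -> 24-21
  Inst |= (static_cast<uint32_t>(Imm8) & 0xF) << 4;       // imm4l -> 7-4
  return Inst;
}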
+class F_XYAR_LD<bits<7> sop, bits<1> sop_su, string op, string op_su, + RegisterOperand regtype> + : F_XYZ_MEM<sop, sop_su, (outs regtype:$vrz), (ins GPR:$rx, GPR:$ry, uimm2:$imm), + op#op_su#"\t$vrz, ($rx, $ry << ${imm})", []>; + +class F_XYAI_ST<bits<7> sop, bits<1> sop_su, string op, string op_su, + RegisterOperand regtype, Operand operand> + : F_I8_XY_MEM<sop, sop_su, (outs), (ins regtype:$vrz, GPR:$rx, operand:$imm8), + !strconcat(op#op_su, "\t$vrz, ($rx, ${imm8})"), []>; + +class F_XYAR_ST<bits<7> sop, bits<1> sop_su, string op, string op_su, + RegisterOperand regtype> + : F_XYZ_MEM<sop, sop_su, (outs), (ins regtype:$vrz, GPR:$rx, GPR:$ry, uimm2:$imm), + op#op_su#"\t$vrz, ($rx, $ry << ${imm})", []>; + +def Mem8SL2 : Operand<iPTR>, ComplexPattern<iPTR, 2, "SelectAddrRegImm8", []> { + let MIOperandInfo = (ops GPR, i32imm); + let PrintMethod = "printAddrModeRegImmOperand"; + let EncoderMethod = "getAddrModeFloatImm8_sl2OpValue"; +} + +def FRRS : Operand<iPTR>, ComplexPattern<iPTR, 3, "SelectAddrRegReg", []> { + let MIOperandInfo = (ops GPR, GPR, i32imm); + let PrintMethod = "printAddrModeRegRegSLOperand"; + let EncoderMethod = "getAddrModeFloatRegRegSLOpValue"; +} + +multiclass FT_XYAI_LD<bits<7> sop, string op> { + def _S : F_XYAI_LD<sop, 0, op, "s", sFPR32Op, uimm8_2>; + let Predicates = [HasFPUv2_DF] in + def _D : F_XYAI_LD<sop, 1, op, "d", sFPR64Op, uimm8_2>; +} + +multiclass FT_XYAR_LD<bits<7> sop, string op> { + def _S : F_XYAR_LD<sop, 0, op, "s", sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_XYAR_LD<sop, 1, op, "d", sFPR64Op>; +} + +multiclass FT_XYAI_ST<bits<7> sop, string op> { + def _S : F_XYAI_ST<sop, 0, op, "s", sFPR32Op, uimm8_2>; + let Predicates = [HasFPUv2_DF] in + def _D : F_XYAI_ST<sop, 1, op, "d", sFPR64Op, uimm8_2>; +} + +multiclass FT_XYAR_ST<bits<7> sop, string op> { + def _S : F_XYAR_ST<sop, 0, op, "s", sFPR32Op>; + let Predicates = [HasFPUv2_DF] in + def _D : F_XYAR_ST<sop, 1, op, "d", sFPR64Op>; +} + +multiclass FT_XYAR_STM<bits<7> sop, string op> { + def _S : F_I4_XY_MEM<sop, 0, (outs), + (ins GPR:$rx, regseq_f1:$regs, variable_ops), + !strconcat(op#"s", "\t$regs, (${rx})"), []>; + let Predicates = [HasFPUv2_DF] in + def _D : F_I4_XY_MEM<sop, 1, (outs), + (ins GPR:$rx, regseq_d1:$regs, variable_ops), + !strconcat(op#"d", "\t$regs, (${rx})"), []>; +} + +multiclass FT_XYAR_LDM<bits<7> sop, string op> { + def _S : F_I4_XY_MEM<sop, 0, (outs), + (ins GPR:$rx, regseq_f1:$regs, variable_ops), + !strconcat(op#"s", "\t$regs, (${rx})"), []>; + let Predicates = [HasFPUv2_DF] in + def _D : F_I4_XY_MEM<sop, 1, (outs), + (ins GPR:$rx, regseq_d1:$regs, variable_ops), + !strconcat(op#"d", "\t$regs, (${rx})"), []>; +} diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td new file mode 100644 index 000000000000..641ad623f140 --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td @@ -0,0 +1,208 @@ +//===- CSKYInstrFormatsF2.td - CSKY Float2.0 Instr Format --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// CSKY Instruction Format Float2.0 Definitions. 
+// +//===----------------------------------------------------------------------===// + +class CSKYInstF2<AddrMode am, dag outs, dag ins, string opcodestr, + list<dag> pattern> + : CSKY32Inst<am, 0x3d, outs, ins, opcodestr, pattern> { + let Predicates = [HasFPUv3_SF]; + let DecoderNamespace = "FPUV3"; +} + +class F2_XYZ<bits<5> datatype, bits<6> sop, string opcodestr, dag outs, dag ins, + list<dag> pattern> + : CSKYInstF2<AddrModeNone, outs, ins, opcodestr, pattern> { + bits<5> vry; + bits<5> vrx; + bits<5> vrz; + + let Inst{25-21} = vry; + let Inst{20-16} = vrx; + let Inst{15-11} = datatype; + let Inst{10-5} = sop; + let Inst{4-0} = vrz; +} + +multiclass F2_XYZ_T<bits<6> sop, string op, PatFrag opnode> { + def _S : F2_XYZ<0b00000, sop, op#".32"#"\t$vrz, $vrx, $vry", + (outs FPR32Op:$vrz), (ins FPR32Op:$vrx, FPR32Op:$vry), + [(set FPR32Op:$vrz, (opnode FPR32Op:$vrx, FPR32Op:$vry))]>; + let Predicates = [HasFPUv3_DF] in + def _D : F2_XYZ<0b00001, sop, op#".64"#"\t$vrz, $vrx, $vry", + (outs FPR64Op:$vrz), (ins FPR64Op:$vrx, FPR64Op:$vry), + [(set FPR64Op:$vrz, (opnode FPR64Op:$vrx, FPR64Op:$vry))]>; +} + +let Constraints = "$vrZ = $vrz" in +multiclass F2_XYZZ_T<bits<6> sop, string op, PatFrag opnode> { + def _S : F2_XYZ<0b00000, sop, op#".32"#"\t$vrz, $vrx, $vry", + (outs FPR32Op:$vrz), (ins FPR32Op:$vrZ, FPR32Op:$vrx, FPR32Op:$vry), + [(set FPR32Op:$vrz, (opnode FPR32Op:$vrx, FPR32Op:$vry, FPR32Op:$vrZ))]>; + let Predicates = [HasFPUv3_DF] in + def _D : F2_XYZ<0b00001, sop, op#".64"#"\t$vrz, $vrx, $vry", + (outs FPR64Op:$vrz), (ins FPR64Op:$vrZ, FPR64Op:$vrx, FPR64Op:$vry), + [(set FPR64Op:$vrz, (opnode FPR64Op:$vrx, FPR64Op:$vry, FPR64Op:$vrZ))]>; +} + +let vry = 0 in { +class F2_XZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op, SDNode opnode> + : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx"), + (outs regtype:$vrz), (ins regtype:$vrx), + [(set regtype:$vrz, (opnode regtype:$vrx))]>; + +class F2_XZ_SET<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> + : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx"), + (outs regtype:$vrz), (ins regtype:$vrx), + []>; + +class F2_XZ_P<bits<5> datatype, bits<6> sop, string op, list<dag> pattern = [], + dag outs, dag ins> + : F2_XYZ<datatype, sop, op#"\t$vrz, $vrx", outs, ins, pattern>; +} + +multiclass F2_XZ_RM<bits<5> datatype, bits<4> sop, string op, dag outs, dag ins> { + def _RN : F2_XZ_P<datatype, {sop, 0b00}, op#".rn", [], outs, ins>; + def _RZ : F2_XZ_P<datatype, {sop, 0b01}, op#".rz", [], outs, ins>; + def _RPI : F2_XZ_P<datatype, {sop, 0b10}, op#".rpi", [], outs, ins>; + def _RNI : F2_XZ_P<datatype, {sop, 0b11}, op#".rni", [], outs, ins>; +} + +multiclass F2_XZ_T<bits<6> sop, string op, SDNode opnode> { + def _S : F2_XZ<0b00000, FPR32Op, sop, op#".32", opnode>; + let Predicates = [HasFPUv3_DF] in + def _D : F2_XZ<0b00001, FPR64Op, sop, op#".64", opnode>; +} + +multiclass F2_XZ_SET_T<bits<6> sop, string op, string suffix = ""> { + def _S : F2_XZ_SET<0b00000, FPR32Op, sop, op#".32"#suffix>; + let Predicates = [HasFPUv3_DF] in + def _D : F2_XZ_SET<0b00001, FPR64Op, sop, op#".64"#suffix>; +} + + +let vrz = 0, isCompare = 1 in +class F2_CXY<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> + : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx, $vry"), + (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), + []>; + +multiclass F2_CXY_T<bits<6> sop, string op> { + def _S : F2_CXY<0b00000, FPR32Op, sop, op#".32">; + let Predicates = [HasFPUv3_DF] in + def _D : F2_CXY<0b00001, FPR64Op, sop, 
op#".64">; +} + + +let vrz = 0, vry = 0, isCompare = 1 in +class F2_CX<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> + : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"), + (outs CARRY:$ca), (ins regtype:$vrx), + []>; + +multiclass F2_CX_T<bits<6> sop, string op> { + def _S : F2_CX<0b00000, FPR32Op, sop, op#".32">; + let Predicates = [HasFPUv3_DF] in + def _D : F2_CX<0b00001, FPR64Op, sop, op#".64">; +} + + +class F2_LDST<bits<2> datatype, bits<1> sop, string op, dag outs, dag ins> + : CSKYInstF2<AddrMode32SDF, outs, ins, + !strconcat(op, "\t$vrz, ($rx, ${imm8})"), []> { + bits<10> imm8; + bits<5> rx; + bits<5> vrz; + + let Inst{25} = vrz{4}; + let Inst{24-21} = imm8{7-4}; + let Inst{20-16} = rx; + let Inst{15-11} = 0b00100; + let Inst{10} = sop; + let Inst{9-8} = datatype; + let Inst{7-4} = imm8{3-0}; + let Inst{3-0} = vrz{3-0}; +} + +class F2_LDST_S<bits<1> sop, string op, dag outs, dag ins> + : F2_LDST<0b00, sop, op#".32", outs, ins>; +class F2_LDST_D<bits<1> sop, string op, dag outs, dag ins> + : F2_LDST<0b01, sop, op#".64", outs, ins>; + +class F2_LDSTM<bits<2> datatype, bits<1> sop, bits<3> sop2, string op, dag outs, dag ins> + : CSKYInstF2<AddrMode32SDF, outs, ins, + !strconcat(op, "\t$regs, (${rx})"), []> { + bits<10> regs; + bits<5> rx; + + let Inst{25-21} = regs{4-0}; + let Inst{20-16} = rx; + let Inst{15-11} = 0b00110; + let Inst{10} = sop; + let Inst{9-8} = datatype; + let Inst{7-5} = sop2; + let Inst{4-0} = regs{9-5}; +} + +class F2_LDSTM_S<bits<1> sop, bits<3> sop2, string op, dag outs, dag ins> + : F2_LDSTM<0b00, sop, sop2, op#".32", outs, ins>; +class F2_LDSTM_D<bits<1> sop, bits<3> sop2, string op, dag outs, dag ins> + : F2_LDSTM<0b01, sop, sop2, op#".64", outs, ins>; + + +class F2_LDSTR<bits<2> datatype, bits<1> sop, string op, dag outs, dag ins> + : CSKYInstF2<AddrModeNone, outs, ins, + op#"\t$rz, ($rx, $ry << ${imm})", []> { + bits<5> rx; + bits<5> ry; + bits<5> rz; + bits<2> imm; + + let Inst{25-21} = ry; + let Inst{20-16} = rx; + let Inst{15-11} = 0b00101; + let Inst{10} = sop; + let Inst{9-8} = datatype; + let Inst{7} = 0; + let Inst{6-5} = imm; + let Inst{4-0} = rz; +} + +class F2_LDSTR_S<bits<1> sop, string op, dag outs, dag ins> + : F2_LDSTR<0b00, sop, op#".32", outs, ins>; +class F2_LDSTR_D<bits<1> sop, string op, dag outs, dag ins> + : F2_LDSTR<0b01, sop, op#".64", outs, ins>; + +class F2_CXYZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op> + : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx, $vry"), + (outs regtype:$vrz), (ins CARRY:$ca, regtype:$vrx, regtype:$vry), + []>; +multiclass F2_CXYZ_T<bits<6> sop, string op> { + def _S : F2_CXYZ<0b00000, FPR32Op, sop, op#".32">; + let Predicates = [HasFPUv3_DF] in + def _D : F2_CXYZ<0b00001, FPR64Op, sop, op#".64">; +} + +class F2_LRW<bits<2> datatype, bits<1> sop, string op, dag outs, dag ins> + : CSKYInstF2<AddrModeNone, outs, ins, + !strconcat(op, "\t$vrz, ${imm8}"), []> { + bits<10> imm8; + bits<5> rx; + bits<5> vrz; + + let Inst{25} = vrz{4}; + let Inst{24-21} = imm8{7-4}; + let Inst{20-16} = 0; + let Inst{15-11} = 0b00111; + let Inst{10} = sop; + let Inst{9-8} = datatype; + let Inst{7-4} = imm8{3-0}; + let Inst{3-0} = vrz{3-0}; +} diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 6fcb136cd99b..c57ccb9d6eea 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include 
"CSKYInstrInfo.h" +#include "CSKYConstantPoolValue.h" #include "CSKYMachineFunctionInfo.h" #include "CSKYTargetMachine.h" #include "llvm/MC/MCContext.h" @@ -24,6 +25,199 @@ using namespace llvm; CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI) : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { + v2sf = STI.hasFPUv2SingleFloat(); + v2df = STI.hasFPUv2DoubleFloat(); + v3sf = STI.hasFPUv3SingleFloat(); + v3df = STI.hasFPUv3DoubleFloat(); +} + +static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + // Block ends with fall-through condbranch. + assert(LastInst.getDesc().isConditionalBranch() && + "Unknown conditional branch"); + Target = LastInst.getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(LastInst.getOpcode())); + Cond.push_back(LastInst.getOperand(0)); +} + +bool CSKYInstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + TBB = FBB = nullptr; + Cond.clear(); + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end() || !isUnpredicatedTerminator(*I)) + return false; + + // Count the number of terminators and find the first unconditional or + // indirect branch. + MachineBasicBlock::iterator FirstUncondOrIndirectBr = MBB.end(); + int NumTerminators = 0; + for (auto J = I.getReverse(); J != MBB.rend() && isUnpredicatedTerminator(*J); + J++) { + NumTerminators++; + if (J->getDesc().isUnconditionalBranch() || + J->getDesc().isIndirectBranch()) { + FirstUncondOrIndirectBr = J.getReverse(); + } + } + + // If AllowModify is true, we can erase any terminators after + // FirstUncondOrIndirectBR. + if (AllowModify && FirstUncondOrIndirectBr != MBB.end()) { + while (std::next(FirstUncondOrIndirectBr) != MBB.end()) { + std::next(FirstUncondOrIndirectBr)->eraseFromParent(); + NumTerminators--; + } + I = FirstUncondOrIndirectBr; + } + + // We can't handle blocks that end in an indirect branch. + if (I->getDesc().isIndirectBranch()) + return true; + + // We can't handle blocks with more than 2 terminators. + if (NumTerminators > 2) + return true; + + // Handle a single unconditional branch. + if (NumTerminators == 1 && I->getDesc().isUnconditionalBranch()) { + TBB = getBranchDestBlock(*I); + return false; + } + + // Handle a single conditional branch. + if (NumTerminators == 1 && I->getDesc().isConditionalBranch()) { + parseCondBranch(*I, TBB, Cond); + return false; + } + + // Handle a conditional branch followed by an unconditional branch. + if (NumTerminators == 2 && std::prev(I)->getDesc().isConditionalBranch() && + I->getDesc().isUnconditionalBranch()) { + parseCondBranch(*std::prev(I), TBB, Cond); + FBB = getBranchDestBlock(*I); + return false; + } + + // Otherwise, we can't handle this. + return true; +} + +unsigned CSKYInstrInfo::removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved) const { + if (BytesRemoved) + *BytesRemoved = 0; + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return 0; + + if (!I->getDesc().isUnconditionalBranch() && + !I->getDesc().isConditionalBranch()) + return 0; + + // Remove the branch. 
+ if (BytesRemoved) + *BytesRemoved += getInstSizeInBytes(*I); + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) + return 1; + --I; + if (!I->getDesc().isConditionalBranch()) + return 1; + + // Remove the branch. + if (BytesRemoved) + *BytesRemoved += getInstSizeInBytes(*I); + I->eraseFromParent(); + return 2; +} + +MachineBasicBlock * +CSKYInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { + assert(MI.getDesc().isBranch() && "Unexpected opcode!"); + // The branch target is always the last operand. + int NumOp = MI.getNumExplicitOperands(); + assert(MI.getOperand(NumOp - 1).isMBB() && "Expected MBB!"); + return MI.getOperand(NumOp - 1).getMBB(); +} + +unsigned CSKYInstrInfo::insertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { + if (BytesAdded) + *BytesAdded = 0; + + // Shouldn't be a fall through. + assert(TBB && "insertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "CSKY branch conditions have two components!"); + + // Unconditional branch. + if (Cond.empty()) { + MachineInstr &MI = *BuildMI(&MBB, DL, get(CSKY::BR32)).addMBB(TBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(MI); + return 1; + } + + // Either a one or two-way conditional branch. + unsigned Opc = Cond[0].getImm(); + MachineInstr &CondMI = *BuildMI(&MBB, DL, get(Opc)).add(Cond[1]).addMBB(TBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(CondMI); + + // One-way conditional branch. + if (!FBB) + return 1; + + // Two-way conditional branch. + MachineInstr &MI = *BuildMI(&MBB, DL, get(CSKY::BR32)).addMBB(FBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(MI); + return 2; +} + +static unsigned getOppositeBranchOpc(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("Unknown conditional branch!"); + case CSKY::BT32: + return CSKY::BF32; + case CSKY::BT16: + return CSKY::BF16; + case CSKY::BF32: + return CSKY::BT32; + case CSKY::BF16: + return CSKY::BT16; + case CSKY::BHZ32: + return CSKY::BLSZ32; + case CSKY::BHSZ32: + return CSKY::BLZ32; + case CSKY::BLZ32: + return CSKY::BHSZ32; + case CSKY::BLSZ32: + return CSKY::BHZ32; + case CSKY::BNEZ32: + return CSKY::BEZ32; + case CSKY::BEZ32: + return CSKY::BNEZ32; + } +} + +bool CSKYInstrInfo::reverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert((Cond.size() == 2) && "Invalid branch condition!"); + Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm())); + return false; } Register CSKYInstrInfo::movImm(MachineBasicBlock &MBB, @@ -147,6 +341,10 @@ unsigned CSKYInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, case CSKY::LD32H: case CSKY::LD32HS: case CSKY::LD32W: + case CSKY::FLD_S: + case CSKY::FLD_D: + case CSKY::f2FLD_S: + case CSKY::f2FLD_D: case CSKY::RESTORE_CARRY: break; } @@ -171,6 +369,10 @@ unsigned CSKYInstrInfo::isStoreToStackSlot(const MachineInstr &MI, case CSKY::ST32B: case CSKY::ST32H: case CSKY::ST32W: + case CSKY::FST_S: + case CSKY::FST_D: + case CSKY::f2FST_S: + case CSKY::f2FST_D: case CSKY::SPILL_CARRY: break; } @@ -204,7 +406,15 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) { Opcode = CSKY::SPILL_CARRY; CFI->setSpillsCR(); - } else { + } else if (v2sf && CSKY::sFPR32RegClass.hasSubClassEq(RC)) + Opcode = CSKY::FST_S; + else if (v2df && CSKY::sFPR64RegClass.hasSubClassEq(RC)) + Opcode = CSKY::FST_D; + else if (v3sf && 
CSKY::FPR32RegClass.hasSubClassEq(RC)) + Opcode = CSKY::f2FST_S; + else if (v3df && CSKY::FPR64RegClass.hasSubClassEq(RC)) + Opcode = CSKY::f2FST_D; + else { llvm_unreachable("Unknown RegisterClass"); } @@ -239,7 +449,15 @@ void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) { Opcode = CSKY::RESTORE_CARRY; CFI->setSpillsCR(); - } else { + } else if (v2sf && CSKY::sFPR32RegClass.hasSubClassEq(RC)) + Opcode = CSKY::FLD_S; + else if (v2df && CSKY::sFPR64RegClass.hasSubClassEq(RC)) + Opcode = CSKY::FLD_D; + else if (v3sf && CSKY::FPR32RegClass.hasSubClassEq(RC)) + Opcode = CSKY::f2FLD_S; + else if (v3df && CSKY::FPR64RegClass.hasSubClassEq(RC)) + Opcode = CSKY::f2FLD_D; + else { llvm_unreachable("Unknown RegisterClass"); } @@ -302,6 +520,38 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode = 0; if (CSKY::GPRRegClass.contains(DestReg, SrcReg)) Opcode = CSKY::MOV32; + else if (v2sf && CSKY::sFPR32RegClass.contains(DestReg, SrcReg)) + Opcode = CSKY::FMOV_S; + else if (v3sf && CSKY::FPR32RegClass.contains(DestReg, SrcReg)) + Opcode = CSKY::f2FMOV_S; + else if (v2df && CSKY::sFPR64RegClass.contains(DestReg, SrcReg)) + Opcode = CSKY::FMOV_D; + else if (v3df && CSKY::FPR64RegClass.contains(DestReg, SrcReg)) + Opcode = CSKY::f2FMOV_D; + else if (v2sf && CSKY::sFPR32RegClass.contains(SrcReg) && + CSKY::GPRRegClass.contains(DestReg)) + Opcode = CSKY::FMFVRL; + else if (v3sf && CSKY::FPR32RegClass.contains(SrcReg) && + CSKY::GPRRegClass.contains(DestReg)) + Opcode = CSKY::f2FMFVRL; + else if (v2df && CSKY::sFPR64RegClass.contains(SrcReg) && + CSKY::GPRRegClass.contains(DestReg)) + Opcode = CSKY::FMFVRL_D; + else if (v3df && CSKY::FPR64RegClass.contains(SrcReg) && + CSKY::GPRRegClass.contains(DestReg)) + Opcode = CSKY::f2FMFVRL_D; + else if (v2sf && CSKY::GPRRegClass.contains(SrcReg) && + CSKY::sFPR32RegClass.contains(DestReg)) + Opcode = CSKY::FMTVRL; + else if (v3sf && CSKY::GPRRegClass.contains(SrcReg) && + CSKY::FPR32RegClass.contains(DestReg)) + Opcode = CSKY::f2FMTVRL; + else if (v2df && CSKY::GPRRegClass.contains(SrcReg) && + CSKY::sFPR64RegClass.contains(DestReg)) + Opcode = CSKY::FMTVRL_D; + else if (v3df && CSKY::GPRRegClass.contains(SrcReg) && + CSKY::FPR64RegClass.contains(DestReg)) + Opcode = CSKY::f2FMTVRL_D; else { LLVM_DEBUG(dbgs() << "src = " << SrcReg << ", dst = " << DestReg); LLVM_DEBUG(I->dump()); @@ -311,3 +561,58 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Opcode), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } + +Register CSKYInstrInfo::getGlobalBaseReg(MachineFunction &MF) const { + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + MachineConstantPool *MCP = MF.getConstantPool(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Register GlobalBaseReg = CFI->getGlobalBaseReg(); + if (GlobalBaseReg != 0) + return GlobalBaseReg; + + // Insert a pseudo instruction to set the GlobalBaseReg into the first + // MBB of the function + MachineBasicBlock &FirstMBB = MF.front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + DebugLoc DL; + + CSKYConstantPoolValue *CPV = CSKYConstantPoolSymbol::Create( + Type::getInt32Ty(MF.getFunction().getContext()), "_GLOBAL_OFFSET_TABLE_", + 0, CSKYCP::ADDR); + + unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); + + MachineMemOperand *MO = + MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, 4, Align(4)); + BuildMI(FirstMBB, MBBI, DL, 
get(CSKY::LRW32), CSKY::R28) + .addConstantPoolIndex(CPI) + .addMemOperand(MO); + + GlobalBaseReg = MRI.createVirtualRegister(&CSKY::GPRRegClass); + BuildMI(FirstMBB, MBBI, DL, get(TargetOpcode::COPY), GlobalBaseReg) + .addReg(CSKY::R28); + + CFI->setGlobalBaseReg(GlobalBaseReg); + return GlobalBaseReg; +} + +unsigned CSKYInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return MI.getDesc().getSize(); + case CSKY::CONSTPOOL_ENTRY: + return MI.getOperand(2).getImm(); + case CSKY::SPILL_CARRY: + case CSKY::RESTORE_CARRY: + case CSKY::PseudoTLSLA32: + return 8; + case TargetOpcode::INLINEASM_BR: + case TargetOpcode::INLINEASM: { + const MachineFunction *MF = MI.getParent()->getParent(); + const char *AsmStr = MI.getOperand(0).getSymbolName(); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + } + } +} diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 450641d96b74..1a1bbbf9154f 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -24,6 +24,11 @@ namespace llvm { class CSKYSubtarget; class CSKYInstrInfo : public CSKYGenInstrInfo { + bool v2sf; + bool v2df; + bool v3sf; + bool v3df; + protected: const CSKYSubtarget &STI; @@ -50,6 +55,28 @@ public: const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; + unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + const DebugLoc &DL, + int *BytesAdded = nullptr) const override; + + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override; + + unsigned removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved = nullptr) const override; + + bool + reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override; + + unsigned getInstSizeInBytes(const MachineInstr &MI) const override; + + Register getGlobalBaseReg(MachineFunction &MF) const; + // Materializes the given integer Val into DstReg. Register movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Val, diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td index 30d9206eec68..a782efe7f4f4 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td @@ -15,22 +15,42 @@ // CSKY specific DAG Nodes. //===----------------------------------------------------------------------===// +// Target-independent type requirements, but with target-specific formats. def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; +def SDT_CSKYCall : SDTypeProfile<0, 2, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_CSKYCallReg : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; + +def SDT_CSKY_LOADADDR : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; + def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, [SDNPHasChain, SDNPOutGlue]>; - def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -// Target-dependent nodes. 
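+// Target-specific DAG nodes. The call and tail-call nodes carry two pointer
+// operands because the callee address is materialized through a constant-pool
+// slot (see the JSRI32/JMPI32 patterns below); the *Reg variants branch
+// through a register instead.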
def CSKY_RET : SDNode<"CSKYISD::RET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def CSKY_CALL : SDNode<"CSKYISD::CALL", SDT_CSKYCall, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; + +def CSKY_CALLReg : SDNode<"CSKYISD::CALLReg", SDT_CSKYCallReg, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; + +def CSKY_TAIL : SDNode<"CSKYISD::TAIL", SDT_CSKYCall, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; + +def CSKY_TAILReg : SDNode<"CSKYISD::TAILReg", SDT_CSKYCallReg, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; + +def CSKY_LOAD_ADDR : SDNode<"CSKYISD::LOAD_ADDR", SDT_CSKY_LOADADDR>; + //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// @@ -57,6 +77,24 @@ def to_tframeindex : SDNodeXForm<frameindex, [{ return CurDAG->getTargetFrameIndex(FI->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); }]>; +def to_tconstpool : SDNodeXForm<constpool, [{ + auto CP = cast<ConstantPoolSDNode>(N); + return CurDAG->getTargetConstantPool(CP->getConstVal(), TLI->getPointerTy(CurDAG->getDataLayout()), + CP->getAlign(), CP->getOffset(), CSKYII::MO_None); +}]>; + +def to_tconstpool_hi16 : SDNodeXForm<constpool, [{ + auto CP = cast<ConstantPoolSDNode>(N); + return CurDAG->getTargetConstantPool(CP->getConstVal(), TLI->getPointerTy(CurDAG->getDataLayout()), + CP->getAlign(), CP->getOffset(), CSKYII::MO_ADDR_HI16); +}]>; + +def to_tconstpool_lo16 : SDNodeXForm<constpool, [{ + auto CP = cast<ConstantPoolSDNode>(N); + return CurDAG->getTargetConstantPool(CP->getConstVal(), TLI->getPointerTy(CurDAG->getDataLayout()), + CP->getAlign(), CP->getOffset(), CSKYII::MO_ADDR_LO16); +}]>; + class oimm<int num> : Operand<i32>, ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> { let EncoderMethod = "getOImmOpValue"; @@ -1055,6 +1093,178 @@ let Predicates = [iHas2E3] in { def : Pat<(sext_inreg GPR:$src, i1), (SEXT32 GPR:$src, 0, 0)>; def : Pat<(and GPR:$src, 255), (ZEXT32 GPR:$src, 7, 0)>; def : Pat<(and GPR:$src, 65535), (ZEXT32 GPR:$src, 15, 0)>; + + // Call Patterns + def : Pat<(CSKY_CALL tglobaladdr, tconstpool:$src2), (JSRI32 tconstpool:$src2)>; + def : Pat<(CSKY_CALL texternalsym, tconstpool:$src2), (JSRI32 tconstpool:$src2)>; + def : Pat<(CSKY_TAIL tglobaladdr, tconstpool:$src2), (JMPI32 tconstpool:$src2)>; + def : Pat<(CSKY_TAIL texternalsym, tconstpool:$src2), (JMPI32 tconstpool:$src2)>; + + def : Pat<(CSKY_CALLReg GPR:$src), (JSR32 GPR:$src)>; + def : Pat<(CSKY_TAILReg GPR:$src), (JMP32 GPR:$src)>; +} + +// Symbol address Patterns +def : Pat<(CSKY_LOAD_ADDR tglobaladdr, tconstpool:$src2), (LRW32 tconstpool:$src2)>; +def : Pat<(CSKY_LOAD_ADDR tblockaddress, tconstpool:$src2), (LRW32 tconstpool:$src2)>; +def : Pat<(CSKY_LOAD_ADDR tjumptable:$src1, tconstpool:$src2), (LRW32_Gen tjumptable:$src1, tconstpool:$src2)>; +def : Pat<(CSKY_LOAD_ADDR texternalsym, tconstpool:$src2), (LRW32 tconstpool:$src2)>; + +let Predicates = [iHas2E3] in + def : Pat<(i32 constpool:$src), (GRS32 (to_tconstpool tconstpool:$src))>; + +let Predicates = [iHasE2] in + def : Pat<(i32 constpool:$src), + (ORI32 (MOVIH32 (to_tconstpool_hi16 tconstpool:$src)), + (to_tconstpool_lo16 tconstpool:$src))>; + +def : Pat<(i32 (load constpool:$src)), (LRW32 (to_tconstpool tconstpool:$src))>; + +// Branch Patterns. 
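+// In the patterns below, a setcc feeding a brcond is lowered to a CMP*32 that
+// sets the condition flag, and BT32/BF32 branch on the flag being set or
+// clear; compares against zero fold into BEZ32/BNEZ32/BLZ32 and friends.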
+let Predicates = [iHasE2] in { + def : Pat<(brcond CARRY:$ca, bb:$imm16), + (BT32 CARRY:$ca, bb:$imm16)>; + + def : Pat<(brcond (i32 (setne GPR:$rs1, uimm16:$rs2)), bb:$imm16), + (BT32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), bb:$imm16)>; + def : Pat<(brcond (i32 (seteq GPR:$rs1, uimm16:$rs2)), bb:$imm16), + (BF32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), bb:$imm16)>; + def : Pat<(brcond (i32 (setuge GPR:$rs1, oimm16:$rs2)), bb:$imm16), + (BT32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>; + def : Pat<(brcond (i32 (setult GPR:$rs1, oimm16:$rs2)), bb:$imm16), + (BF32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>; + def : Pat<(brcond (i32 (setlt GPR:$rs1, oimm16:$rs2)), bb:$imm16), + (BT32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>; + def : Pat<(brcond (i32 (setge GPR:$rs1, oimm16:$rs2)), bb:$imm16), + (BF32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>; + +} + +let Predicates = [iHas2E3] in { + +def : Pat<(brcond (i32 (setne GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BT32 (CMPNE32 GPR:$rs1, GPR:$rs2), bb:$imm16)>; +def : Pat<(brcond (i32 (seteq GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BF32 (CMPNE32 GPR:$rs1, GPR:$rs2), bb:$imm16)>; +def : Pat<(brcond (i32 (setuge GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BT32 (CMPHS32 GPR:$rs1, GPR:$rs2), bb:$imm16)>; +def : Pat<(brcond (i32 (setule GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BT32 (CMPHS32 GPR:$rs2, GPR:$rs1), bb:$imm16)>; +def : Pat<(brcond (i32 (setult GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BF32 (CMPHS32 GPR:$rs1, GPR:$rs2), bb:$imm16)>; +def : Pat<(brcond (i32 (setugt GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BF32 (CMPHS32 GPR:$rs2, GPR:$rs1), bb:$imm16)>; +def : Pat<(brcond (i32 (setlt GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BT32 (CMPLT32 GPR:$rs1, GPR:$rs2), bb:$imm16)>; +def : Pat<(brcond (i32 (setgt GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BT32 (CMPLT32 GPR:$rs2, GPR:$rs1), bb:$imm16)>; +def : Pat<(brcond (i32 (setge GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BF32 (CMPLT32 GPR:$rs1, GPR:$rs2), bb:$imm16)>; +def : Pat<(brcond (i32 (setle GPR:$rs1, GPR:$rs2)), bb:$imm16), + (BF32 (CMPLT32 GPR:$rs2, GPR:$rs1), bb:$imm16)>; + +def : Pat<(brcond (i32 (seteq GPR:$rs1, (i32 0))), bb:$imm16), + (BEZ32 GPR:$rs1, bb:$imm16)>; +def : Pat<(brcond (i32 (setne GPR:$rs1, (i32 0))), bb:$imm16), + (BNEZ32 GPR:$rs1, bb:$imm16)>; +def : Pat<(brcond (i32 (setlt GPR:$rs1, (i32 0))), bb:$imm16), + (BLZ32 GPR:$rs1, bb:$imm16)>; +def : Pat<(brcond (i32 (setge GPR:$rs1, (i32 0))), bb:$imm16), + (BHSZ32 GPR:$rs1, bb:$imm16)>; +def : Pat<(brcond (i32 (setgt GPR:$rs1, (i32 0))), bb:$imm16), + (BHZ32 GPR:$rs1, bb:$imm16)>; +def : Pat<(brcond (i32 (setle GPR:$rs1, (i32 0))), bb:$imm16), + (BLSZ32 GPR:$rs1, bb:$imm16)>; +} + +// Compare Patterns. 
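+// When a setcc result is used as a value rather than as a branch condition,
+// the condition flag is copied into a GPR: MVC32 moves the flag itself and
+// MVCV32 its complement (used where only the inverse compare exists, e.g.
+// seteq via CMPNE32).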
+let Predicates = [iHas2E3] in { + def : Pat<(setne GPR:$rs1, GPR:$rs2), + (CMPNE32 GPR:$rs1, GPR:$rs2)>; + def : Pat<(i32 (seteq GPR:$rs1, GPR:$rs2)), + (MVCV32 (CMPNE32 GPR:$rs1, GPR:$rs2))>; + def : Pat<(setuge GPR:$rs1, GPR:$rs2), + (CMPHS32 GPR:$rs1, GPR:$rs2)>; + def : Pat<(setule GPR:$rs1, GPR:$rs2), + (CMPHS32 GPR:$rs2, GPR:$rs1)>; + def : Pat<(i32 (setult GPR:$rs1, GPR:$rs2)), + (MVCV32 (CMPHS32 GPR:$rs1, GPR:$rs2))>; + def : Pat<(i32 (setugt GPR:$rs1, GPR:$rs2)), + (MVCV32 (CMPHS32 GPR:$rs2, GPR:$rs1))>; + def : Pat<(setlt GPR:$rs1, GPR:$rs2), + (CMPLT32 GPR:$rs1, GPR:$rs2)>; + def : Pat<(setgt GPR:$rs1, GPR:$rs2), + (CMPLT32 GPR:$rs2, GPR:$rs1)>; + def : Pat<(i32 (setge GPR:$rs1, GPR:$rs2)), + (MVCV32 (CMPLT32 GPR:$rs1, GPR:$rs2))>; + def : Pat<(i32 (setle GPR:$rs1, GPR:$rs2)), + (MVCV32 (CMPLT32 GPR:$rs2, GPR:$rs1))>; +} + +let Predicates = [iHasE2] in { + def : Pat<(setne GPR:$rs1, uimm16:$rs2), + (CMPNEI32 GPR:$rs1, uimm16:$rs2)>; + let Predicates = [iHas2E3] in + def : Pat<(i32 (seteq GPR:$rs1, uimm16:$rs2)), + (MVCV32 (CMPNEI32 GPR:$rs1, uimm16:$rs2))>; + def : Pat<(setuge GPR:$rs1, oimm16:$rs2), + (CMPHSI32 GPR:$rs1, oimm16:$rs2)>; + let Predicates = [iHas2E3] in + def : Pat<(i32 (setult GPR:$rs1, oimm16:$rs2)), + (MVCV32 (CMPHSI32 GPR:$rs1, oimm16:$rs2))>; + def : Pat<(setlt GPR:$rs1, oimm16:$rs2), + (CMPLTI32 GPR:$rs1, oimm16:$rs2)>; + let Predicates = [iHas2E3] in + def : Pat<(i32 (setge GPR:$rs1, oimm16:$rs2)), + (MVCV32 (CMPLTI32 GPR:$rs1, oimm16:$rs2))>; +} + +// Select Patterns. +let Predicates = [iHasE2] in { +def : Pat<(select CARRY:$ca, GPR:$rx, GPR:$false), + (MOVT32 CARRY:$ca, GPR:$rx, GPR:$false)>; +def : Pat<(select (and CARRY:$ca, 1), GPR:$rx, GPR:$false), + (MOVT32 CARRY:$ca, GPR:$rx, GPR:$false)>; + +def : Pat<(select (i32 (setne GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPNE32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setne GPR:$rs1, uimm16:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (seteq GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPNE32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (seteq GPR:$rs1, uimm16:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), GPR:$rx, GPR:$false)>; + +def : Pat<(select (i32 (setuge GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPHS32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setuge GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setule GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPHS32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setult GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPHS32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setult GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setugt GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPHS32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>; + +def : Pat<(select (i32 (setlt GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPLT32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setlt GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setgt GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVT32 
(CMPLT32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setge GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPLT32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setge GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>; +def : Pat<(select (i32 (setle GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false), + (MOVF32 (CMPLT32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>; + +def : Pat<(select CARRY:$ca, GPR:$rx, GPR:$false), + (ISEL32 CARRY:$ca, GPR:$rx, GPR:$false)>; +def : Pat<(select (and CARRY:$ca, 1), GPR:$rx, GPR:$false), + (ISEL32 CARRY:$ca, GPR:$rx, GPR:$false)>; } // Constant materialize patterns. @@ -1150,3 +1360,5 @@ def CONSTPOOL_ENTRY : CSKYPseudo<(outs), (ins i32imm:$instid, i32imm:$cpidx, i32imm:$size), "", []>; include "CSKYInstrInfo16Instr.td" +include "CSKYInstrInfoF1.td" +include "CSKYInstrInfoF2.td" diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfoF1.td b/llvm/lib/Target/CSKY/CSKYInstrInfoF1.td new file mode 100644 index 000000000000..30cef024f35a --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYInstrInfoF1.td @@ -0,0 +1,420 @@ +//===- CSKYInstrInfoF1.td - CSKY Instruction Float1.0 ------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the CSKY instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +def regseq_f1 : Operand<iPTR> { + let EncoderMethod = "getRegisterSeqOpValue"; + let ParserMatchClass = RegSeqAsmOperand<"V1">; + let PrintMethod = "printRegisterSeq"; + let DecoderMethod = "DecodeRegSeqOperandF1"; + let MIOperandInfo = (ops sFPR32, uimm5); +} + +def regseq_d1 : Operand<iPTR> { + let EncoderMethod = "getRegisterSeqOpValue"; + let ParserMatchClass = RegSeqAsmOperand<"V1">; + let PrintMethod = "printRegisterSeq"; + let DecoderMethod = "DecodeRegSeqOperandD1"; + let MIOperandInfo = (ops sFPR64, uimm5); +} + +def sFPR32Op : RegisterOperand<sFPR32, "printFPR">; +def sFPR64Op : RegisterOperand<sFPR64, "printFPR">; +def sFPR64_V_OP : RegisterOperand<sFPR64_V, "printFPR">; + +include "CSKYInstrFormatsF1.td" + +//===----------------------------------------------------------------------===// +// CSKY specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDT_BITCAST_TO_LOHI : SDTypeProfile<2, 1, [SDTCisSameAs<0, 1>]>; +def CSKY_BITCAST_TO_LOHI : SDNode<"CSKYISD::BITCAST_TO_LOHI", SDT_BITCAST_TO_LOHI>; +def SDT_BITCAST_FROM_LOHI : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def CSKY_BITCAST_FROM_LOHI : SDNode<"CSKYISD::BITCAST_FROM_LOHI", SDT_BITCAST_FROM_LOHI>; +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. 
+//===----------------------------------------------------------------------===// + +def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>; + +def fpimm32_hi16 : SDNodeXForm<fpimm, [{ + return CurDAG->getTargetConstant( + (N->getValueAPF().bitcastToAPInt().getZExtValue() >> 16) & 0xFFFF, + SDLoc(N), MVT::i32); +}]>; + +def fpimm32_lo16 : SDNodeXForm<fpimm, [{ + return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue() & 0xFFFF, + SDLoc(N), MVT::i32); +}]>; + +class fpimm_xform<int width, int shift = 0> : SDNodeXForm<fpimm, + "return CurDAG->getTargetConstant(N->getValueAPF().bitcastToAPInt().lshr("#shift#").getLoBits("#width#"), SDLoc(N), MVT::i32);">; + +class fpimm_xform_i16<int width, int shift = 0> : SDNodeXForm<fpimm, + "return CurDAG->getTargetConstant(N->getValueAPF().bitcastToAPInt().lshr("#shift#").getLoBits("#width#"), SDLoc(N), MVT::i16);">; + +class fpimm_t<int width, int shift = 0> : PatLeaf<(fpimm), + "return isShiftedUInt<"#width#", "#shift#">(N->getValueAPF().bitcastToAPInt().getZExtValue());">; + +def fpimm8 : fpimm_t<8>; +def fpimm8_8 : fpimm_t<8, 8>; +def fpimm8_16 : fpimm_t<8, 16>; +def fpimm8_24 : fpimm_t<8, 24>; +def fpimm16 : fpimm_t<16>; +def fpimm16_8 : fpimm_t<16, 8>; +def fpimm16_16 : fpimm_t<16, 16>; +def fpimm24 : fpimm_t<24>; +def fpimm24_8 : fpimm_t<24, 8>; +def fpimm32 : fpimm_t<32>; + +def fpimm8_sr0_XFORM : fpimm_xform<8>; +def fpimm8_sr8_XFORM : fpimm_xform<8, 8>; +def fpimm8_sr16_XFORM : fpimm_xform<8, 16>; +def fpimm8_sr24_XFORM : fpimm_xform<8, 24>; + +def fpimm8_sr0_i16_XFORM : fpimm_xform_i16<8>; +def fpimm8_sr8_i16_XFORM : fpimm_xform_i16<8, 8>; + +def fconstpool_symbol : Operand<iPTR> { + let ParserMatchClass = Constpool; + let EncoderMethod = + "getConstpoolSymbolOpValue<CSKY::fixup_csky_pcrel_uimm8_scale4>"; + let DecoderMethod = "decodeUImmOperand<8, 2>"; + let PrintMethod = "printConstpool"; + let OperandType = "OPERAND_PCREL"; +} + + + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//arithmetic + +def FABSM : F_XZ<0x2, 0b000110, "fabsm", "", UnOpFrag<(fabs node:$Src)>, sFPR64_V_OP>; +def FNEGM : F_XZ<0x2, 0b000111, "fnegm", "", UnOpFrag<(fneg node:$Src)>, sFPR64_V_OP>; +def FADDM : F_XYZ<0x2, 0b000000, "faddm", "", BinOpFrag<(fadd node:$LHS, node:$RHS)>, sFPR64_V_OP>; +def FSUBM : F_XYZ<0x2, 0b000001, "fsubm", "", BinOpFrag<(fsub node:$LHS, node:$RHS)>, sFPR64_V_OP>; +def FMULM : F_XYZ<0x2, 0b010000, "fmulm", "", BinOpFrag<(fmul node:$LHS, node:$RHS)>, sFPR64_V_OP>; +def FNMULM : F_XYZ<0x2, 0b010001, "fnmulm", "", BinOpFrag<(fneg (fmul node:$LHS, node:$RHS))>, sFPR64_V_OP>; +def FMACM : F_ACCUM_XYZ<0x2, 0b010100, "fmacm", "", TriOpFrag<(fadd node:$LHS, (fmul node:$MHS, node:$RHS))>, sFPR64_V_OP>; +def FMSCM : F_ACCUM_XYZ<0x2, 0b010101, "fmscm", "", TriOpFrag<(fsub (fmul node:$MHS, node:$RHS), node:$LHS)>, sFPR64_V_OP>; +def FNMACM : F_ACCUM_XYZ<0x2, 0b010110, "fnmacm", "", TriOpFrag<(fsub node:$LHS, (fmul node:$MHS, node:$RHS))>, sFPR64_V_OP>; +def FNMSCM : F_ACCUM_XYZ<0x2, 0b010111, "fnmscm", "", TriOpFrag<(fneg (fadd node:$LHS, (fmul node:$MHS, node:$RHS)))>, sFPR64_V_OP>; + +def FMOVM : F_MOV<0x2, 0b000100, "fmovm", "", sFPR64_V_OP>; + +defm FABS : FT_XZ<0b000110, "fabs", UnOpFrag<(fabs node:$Src)>>; +defm FNEG : FT_XZ<0b000111, "fneg", UnOpFrag<(fneg node:$Src)>>; +defm FSQRT : FT_XZ<0b011010, "fsqrt", UnOpFrag<(fsqrt node:$Src)>>; + +defm FADD : FT_XYZ<0b000000, 
"fadd", BinOpFrag<(fadd node:$LHS, node:$RHS)>>; +defm FSUB : FT_XYZ<0b000001, "fsub", BinOpFrag<(fsub node:$LHS, node:$RHS)>>; +defm FDIV : FT_XYZ<0b011000, "fdiv", BinOpFrag<(fdiv node:$LHS, node:$RHS)>>; +defm FMUL : FT_XYZ<0b010000, "fmul", BinOpFrag<(fmul node:$LHS, node:$RHS)>>; +defm FNMUL : FT_XYZ<0b010001, "fnmul", BinOpFrag<(fneg (fmul node:$LHS, node:$RHS))>>; +defm FMAC : FT_ACCUM_XYZ<0b010100, "fmac", TriOpFrag<(fadd node:$LHS, (fmul node:$MHS, node:$RHS))>>; +defm FMSC : FT_ACCUM_XYZ<0b010101, "fmsc", TriOpFrag<(fsub (fmul node:$MHS, node:$RHS), node:$LHS)>>; +defm FNMAC : FT_ACCUM_XYZ<0b010110, "fnmac", TriOpFrag<(fsub node:$LHS, (fmul node:$MHS, node:$RHS))>>; +defm FNMSC : FT_ACCUM_XYZ<0b010111, "fnmsc", TriOpFrag<(fneg (fadd node:$LHS, (fmul node:$MHS, node:$RHS)))>>; + +defm FCMPHS : FT_CMPXY<0b001100, "fcmphs">; +defm FCMPLT : FT_CMPXY<0b001101, "fcmplt">; +defm FCMPNE : FT_CMPXY<0b001110, "fcmpne">; +defm FCMPUO : FT_CMPXY<0b001111, "fcmpuo">; +defm FCMPZHS : FT_CMPZX<0b001000, "fcmpzhs">; +defm FCMPZLS : FT_CMPZX<0b001001, "fcmpzls">; +defm FCMPZNE : FT_CMPZX<0b001010, "fcmpzne">; +defm FCMPZUO : FT_CMPZX<0b001011, "fcmpzuo">; + +defm FRECIP : FT_MOV<0b011001, "frecip">; + +//fmov, fmtvr, fmfvr +defm FMOV : FT_MOV<0b000100, "fmov">; +def FMFVRL : F_XZ_GF<3, 0b011001, (outs GPR:$rz), (ins sFPR32Op:$vrx), + "fmfvrl\t$rz, $vrx", [(set GPR:$rz, (bitconvert sFPR32Op:$vrx))]>; +def FMTVRL : F_XZ_FG<3, 0b011011, (outs sFPR32Op:$vrz), (ins GPR:$rx), + "fmtvrl\t$vrz, $rx", [(set sFPR32Op:$vrz, (bitconvert GPR:$rx))]>; + +let Predicates = [HasFPUv2_DF] in { + let isCodeGenOnly = 1 in + def FMFVRL_D : F_XZ_GF<3, 0b011001, (outs GPR:$rz), (ins sFPR64Op:$vrx), + "fmfvrl\t$rz, $vrx", []>; + def FMFVRH_D : F_XZ_GF<3, 0b011000, (outs GPR:$rz), (ins sFPR64Op:$vrx), + "fmfvrh\t$rz, $vrx", []>; + let isCodeGenOnly = 1 in + def FMTVRL_D : F_XZ_FG<3, 0b011011, (outs sFPR64Op:$vrz), (ins GPR:$rx), + "fmtvrl\t$vrz, $rx", []>; +let Constraints = "$vrZ = $vrz" in + def FMTVRH_D : F_XZ_FG<3, 0b011010, (outs sFPR64Op:$vrz), (ins sFPR64Op:$vrZ, GPR:$rx), + "fmtvrh\t$vrz, $rx", []>; +} + +//fcvt + +def FSITOS : F_XZ_TRANS<0b010000, "fsitos", sFPR32Op, sFPR32Op>; +def : Pat<(f32 (sint_to_fp GPR:$a)), + (FSITOS (COPY_TO_REGCLASS GPR:$a, sFPR32))>, + Requires<[HasFPUv2_SF]>; + +def FUITOS : F_XZ_TRANS<0b010001, "fuitos", sFPR32Op, sFPR32Op>; +def : Pat<(f32 (uint_to_fp GPR:$a)), + (FUITOS (COPY_TO_REGCLASS GPR:$a, sFPR32))>, + Requires<[HasFPUv2_SF]>; + +def FSITOD : F_XZ_TRANS<0b010100, "fsitod", sFPR64Op, sFPR64Op>; +def : Pat<(f64 (sint_to_fp GPR:$a)), + (FSITOD (COPY_TO_REGCLASS GPR:$a, sFPR64))>, + Requires<[HasFPUv2_DF]>; + +def FUITOD : F_XZ_TRANS<0b010101, "fuitod", sFPR64Op, sFPR64Op>; +def : Pat<(f64 (uint_to_fp GPR:$a)), + (FUITOD (COPY_TO_REGCLASS GPR:$a, sFPR64))>, + Requires<[HasFPUv2_DF]>; + +let Predicates = [HasFPUv2_DF] in { +def FDTOS : F_XZ_TRANS_DS<0b010110,"fdtos", UnOpFrag<(fpround node:$Src)>>; +def FSTOD : F_XZ_TRANS_SD<0b010111,"fstod", UnOpFrag<(fpextend node:$Src)>>; +} + +def rpiFSTOSI : F_XZ_TRANS<0b000010, "fstosi.rpi", sFPR32Op, sFPR32Op>; +def rpiFSTOUI : F_XZ_TRANS<0b000110, "fstoui.rpi", sFPR32Op, sFPR32Op>; +def rzFSTOSI : F_XZ_TRANS<0b000001, "fstosi.rz", sFPR32Op, sFPR32Op>; +def rzFSTOUI : F_XZ_TRANS<0b000101, "fstoui.rz", sFPR32Op, sFPR32Op>; +def rnFSTOSI : F_XZ_TRANS<0b000000, "fstosi.rn", sFPR32Op, sFPR32Op>; +def rnFSTOUI : F_XZ_TRANS<0b000100, "fstoui.rn", sFPR32Op, sFPR32Op>; +def rniFSTOSI : F_XZ_TRANS<0b000011, "fstosi.rni", sFPR32Op, sFPR32Op>; +def 
rniFSTOUI : F_XZ_TRANS<0b000111, "fstoui.rni", sFPR32Op, sFPR32Op>; + +let Predicates = [HasFPUv2_DF] in { +def rpiFDTOSI : F_XZ_TRANS<0b001010, "fdtosi.rpi", sFPR64Op, sFPR64Op>; +def rpiFDTOUI : F_XZ_TRANS<0b001110, "fdtoui.rpi", sFPR64Op, sFPR64Op>; +def rzFDTOSI : F_XZ_TRANS<0b001001, "fdtosi.rz", sFPR64Op, sFPR64Op>; +def rzFDTOUI : F_XZ_TRANS<0b001101, "fdtoui.rz", sFPR64Op, sFPR64Op>; +def rnFDTOSI : F_XZ_TRANS<0b001000, "fdtosi.rn", sFPR64Op, sFPR64Op>; +def rnFDTOUI : F_XZ_TRANS<0b001100, "fdtoui.rn", sFPR64Op, sFPR64Op>; +def rniFDTOSI : F_XZ_TRANS<0b001011, "fdtosi.rni", sFPR64Op, sFPR64Op>; +def rniFDTOUI : F_XZ_TRANS<0b001111, "fdtoui.rni", sFPR64Op, sFPR64Op>; +} + +multiclass FPToIntegerPats<SDNode round, string SUFFIX> { + def : Pat<(i32 (fp_to_sint (round sFPR32Op:$Rn))), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOSI) sFPR32Op:$Rn), GPR)>, + Requires<[HasFPUv2_SF]>; + def : Pat<(i32 (fp_to_uint (round sFPR32Op:$Rn))), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOUI) sFPR32Op:$Rn), GPR)>, + Requires<[HasFPUv2_SF]>; + def : Pat<(i32 (fp_to_sint (round sFPR64Op:$Rn))), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOSI) sFPR64Op:$Rn), GPR)>, + Requires<[HasFPUv2_DF]>; + def : Pat<(i32 (fp_to_uint (round sFPR64Op:$Rn))), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOUI) sFPR64Op:$Rn), GPR)>, + Requires<[HasFPUv2_DF]>; +} + +defm: FPToIntegerPats<fceil, "rpi">; +defm: FPToIntegerPats<fround, "rn">; +defm: FPToIntegerPats<ffloor, "rni">; + +multiclass FPToIntegerTowardszeroPats<string SUFFIX> { + def : Pat<(i32 (fp_to_sint sFPR32Op:$Rn)), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOSI) sFPR32Op:$Rn), GPR)>, + Requires<[HasFPUv2_SF]>; + def : Pat<(i32 (fp_to_uint sFPR32Op:$Rn)), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOUI) sFPR32Op:$Rn), GPR)>, + Requires<[HasFPUv2_SF]>; + def : Pat<(i32 (fp_to_sint sFPR64Op:$Rn)), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOSI) sFPR64Op:$Rn), GPR)>, + Requires<[HasFPUv2_DF]>; + def : Pat<(i32 (fp_to_uint sFPR64Op:$Rn)), + (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOUI) sFPR64Op:$Rn), GPR)>, + Requires<[HasFPUv2_DF]>; +} + +defm: FPToIntegerTowardszeroPats<"rz">; + + +//fld, fst +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + defm FLD : FT_XYAI_LD<0b0010000, "fld">; + defm FLDR : FT_XYAR_LD<0b0010100, "fldr">; + defm FLDM : FT_XYAR_LDM<0b0011000, "fldm">; + + let Predicates = [HasFPUv2_DF] in + def FLDRM : F_XYAR_LD<0b0010101, 0, "fldrm", "", sFPR64Op>; + let Predicates = [HasFPUv2_DF] in + def FLDMM : F_I4_XY_MEM<0b0011001, 0, + (outs), (ins GPR:$rx, regseq_d1:$regs, variable_ops), "fldmm\t$regs, (${rx})", []>; + let Predicates = [HasFPUv2_DF] in + def FLDM : F_XYAI_LD<0b0010001, 0, "fldm", "", sFPR64Op, uimm8_3>; +} + + + +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + defm FST : FT_XYAI_ST<0b0010010, "fst">; + defm FSTR : FT_XYAR_ST<0b0010110, "fstr">; + defm FSTM : FT_XYAR_STM<0b0011010, "fstm">; + + let Predicates = [HasFPUv2_DF] in + def FSTRM : F_XYAR_ST<0b0010111, 0, "fstrm", "", sFPR64Op>; + let Predicates = [HasFPUv2_DF] in + def FSTMM : F_I4_XY_MEM<0b0011011, 0, + (outs), (ins GPR:$rx, regseq_d1:$regs, variable_ops), "fstmm\t$regs, (${rx})", []>; + let Predicates = [HasFPUv2_DF] in + def FSTM : F_XYAI_ST<0b0010011, 0, "fstm", "", sFPR64Op, uimm8_3>; +} + +defm : LdPat<load, uimm8_2, FLD_S, f32>, Requires<[HasFPUv2_SF]>; +defm : LdPat<load, uimm8_2, FLD_D, f64>, Requires<[HasFPUv2_DF]>; +defm : LdrPat<load, FLDR_S, f32>, Requires<[HasFPUv2_SF]>; +defm 
: LdrPat<load, FLDR_D, f64>, Requires<[HasFPUv2_DF]>; + +defm : StPat<store, f32, uimm8_2, FST_S>, Requires<[HasFPUv2_SF]>; +defm : StPat<store, f64, uimm8_2, FST_D>, Requires<[HasFPUv2_DF]>; +defm : StrPat<store, f32, FSTR_S>, Requires<[HasFPUv2_SF]>; +defm : StrPat<store, f64, FSTR_D>, Requires<[HasFPUv2_DF]>; + + +def : Pat<(f32 fpimm16:$imm), (COPY_TO_REGCLASS (MOVI32 (fpimm32_lo16 fpimm16:$imm)), sFPR32)>, + Requires<[HasFPUv2_SF]>; +def : Pat<(f32 fpimm16_16:$imm), (f32 (COPY_TO_REGCLASS (MOVIH32 (fpimm32_hi16 fpimm16_16:$imm)), sFPR32))>, + Requires<[HasFPUv2_SF]>; +def : Pat<(f32 fpimm:$imm), (COPY_TO_REGCLASS (ORI32 (MOVIH32 (fpimm32_hi16 fpimm:$imm)), (fpimm32_lo16 fpimm:$imm)), sFPR32)>, + Requires<[HasFPUv2_SF]>; + +def : Pat<(f64(CSKY_BITCAST_FROM_LOHI GPR:$rs1, GPR:$rs2)), (FMTVRH_D(FMTVRL_D GPR:$rs1), GPR:$rs2)>, + Requires<[HasFPUv2_DF]>; + +multiclass BRCond_Bin<CondCode CC, string Instr, Instruction Br, Instruction MV> { + let Predicates = [HasFPUv2_SF] in + def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_S) sFPR32Op:$rs1, sFPR32Op:$rs2), bb:$imm16)>; + let Predicates = [HasFPUv2_DF] in + def : Pat<(brcond (i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_D) sFPR64Op:$rs1, sFPR64Op:$rs2), bb:$imm16)>; + + let Predicates = [HasFPUv2_SF] in + def : Pat<(i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_S) sFPR32Op:$rs1, sFPR32Op:$rs2))>; + let Predicates = [HasFPUv2_DF] in + def : Pat<(i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_D) sFPR64Op:$rs1, sFPR64Op:$rs2))>; +} + +multiclass BRCond_Bin_SWAP<CondCode CC, string Instr, Instruction Br, Instruction MV> { + let Predicates = [HasFPUv2_SF] in + def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_S) sFPR32Op:$rs2, sFPR32Op:$rs1), bb:$imm16)>; + let Predicates = [HasFPUv2_DF] in + def : Pat<(brcond (i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_D) sFPR64Op:$rs2, sFPR64Op:$rs1), bb:$imm16)>; + + let Predicates = [HasFPUv2_SF] in + def : Pat<(i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_S) sFPR32Op:$rs2, sFPR32Op:$rs1))>; + let Predicates = [HasFPUv2_DF] in + def : Pat<(i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_D) sFPR64Op:$rs2, sFPR64Op:$rs1))>; +} + +// inverse (order && compare) to (unorder || inverse(compare)) + +defm : BRCond_Bin<SETUNE, "FCMPNE", BT32, MVC32>; +defm : BRCond_Bin<SETOEQ, "FCMPNE", BF32, MVCV32>; +defm : BRCond_Bin<SETOGE, "FCMPHS", BT32, MVC32>; +defm : BRCond_Bin<SETOLT, "FCMPLT", BT32, MVC32>; +defm : BRCond_Bin<SETUO, "FCMPUO", BT32, MVC32>; +defm : BRCond_Bin<SETO, "FCMPUO", BF32, MVCV32>; +defm : BRCond_Bin_SWAP<SETOGT, "FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_SWAP<SETOLE, "FCMPHS", BT32, MVC32>; + +defm : BRCond_Bin<SETNE, "FCMPNE", BT32, MVC32>; +defm : BRCond_Bin<SETEQ, "FCMPNE", BF32, MVCV32>; +defm : BRCond_Bin<SETGE, "FCMPHS", BT32, MVC32>; +defm : BRCond_Bin<SETLT, "FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_SWAP<SETGT, "FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_SWAP<SETLE, "FCMPHS", BT32, MVC32>; + +// ----------- + +let Predicates = [HasFPUv2_SF] in { + def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGE)), bb:$imm16), + (BT32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGE)), + (MVC32 (FCMPZHS_S 
sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLT)), bb:$imm16),
+            (BF32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLT)),
+            (MVCV32 (FCMPZHS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLE)), bb:$imm16),
+            (BT32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLE)),
+            (MVC32 (FCMPZLS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGT)), bb:$imm16),
+            (BF32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGT)),
+            (MVCV32 (FCMPZLS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETUNE)), bb:$imm16),
+            (BT32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETUNE)),
+            (MVC32 (FCMPZNE_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOEQ)), bb:$imm16),
+            (BF32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOEQ)),
+            (MVCV32 (FCMPZNE_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm, SETUO)), bb:$imm16),
+            (BT32 (FCMPZUO_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm, SETUO)),
+            (MVC32 (FCMPZUO_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm, SETO)), bb:$imm16),
+            (BF32 (FCMPZUO_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm, SETO)),
+            (MVCV32 (FCMPZUO_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETGE)), bb:$imm16),
+            (BT32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETGE)),
+            (MVC32 (FCMPZHS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETLT)), bb:$imm16),
+            (BF32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETLT)),
+            (MVCV32 (FCMPZHS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETLE)), bb:$imm16),
+            (BT32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETLE)),
+            (MVC32 (FCMPZLS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETGT)), bb:$imm16),
+            (BF32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETGT)),
+            (MVCV32 (FCMPZLS_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETNE)), bb:$imm16),
+            (BT32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETNE)),
+            (MVC32 (FCMPZNE_S sFPR32Op:$rs1))>;
+  def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETEQ)), bb:$imm16),
+            (BF32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+  def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETEQ)),
+            (MVCV32 (FCMPZNE_S sFPR32Op:$rs1))>;
+}
+
+let usesCustomInserter = 1 in {
+  let Predicates = [HasFPUv2_SF] in
+  def FSELS : CSKYPseudo<(outs sFPR32Op:$dst), (ins CARRY:$cond, sFPR32Op:$src1, sFPR32Op:$src2),
+              "!fsels\t$dst, $src1, $src2", [(set sFPR32Op:$dst, (select CARRY:$cond, sFPR32Op:$src1, sFPR32Op:$src2))]>;
+
+  let Predicates = [HasFPUv2_DF] in
+  def FSELD : CSKYPseudo<(outs sFPR64Op:$dst), (ins CARRY:$cond, sFPR64Op:$src1, sFPR64Op:$src2),
+              "!fseld\t$dst, $src1, $src2", [(set sFPR64Op:$dst, (select CARRY:$cond, sFPR64Op:$src1, sFPR64Op:$src2))]>;
+}
\ No newline at end of file diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfoF2.td b/llvm/lib/Target/CSKY/CSKYInstrInfoF2.td new file mode 100644 index 000000000000..8a00e7d9af3a --- /dev/null +++ b/llvm/lib/Target/CSKY/CSKYInstrInfoF2.td @@ -0,0 +1,462 @@ +//===- CSKYInstrInfoF2.td - CSKY Instruction Float2.0 ------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the CSKY instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +def regseq_f2 : Operand<i32> { + let EncoderMethod = "getRegisterSeqOpValue"; + let ParserMatchClass = RegSeqAsmOperand<"V2">; + let PrintMethod = "printRegisterSeq"; + let DecoderMethod = "DecodeRegSeqOperandF2"; + let MIOperandInfo = (ops FPR32, uimm5); +} + +def regseq_d2 : Operand<i32> { + let EncoderMethod = "getRegisterSeqOpValue"; + let ParserMatchClass = RegSeqAsmOperand<"V2">; + let PrintMethod = "printRegisterSeq"; + let DecoderMethod = "DecodeRegSeqOperandD2"; + let MIOperandInfo = (ops FPR64, uimm5); +} + +def FPR32Op : RegisterOperand<FPR32, "printFPR">; +def FPR64Op : RegisterOperand<FPR64, "printFPR">; + +include "CSKYInstrFormatsF2.td" + +// Predicates +def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{ + return isOrEquivalentToAdd(N); +}]>; + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +defm f2FADD : F2_XYZ_T<0b000000, "fadd", BinOpFrag<(fadd node:$LHS, node:$RHS)>>; +defm f2FSUB : F2_XYZ_T<0b000001, "fsub", BinOpFrag<(fsub node:$LHS, node:$RHS)>>; +defm f2FDIV : F2_XYZ_T<0b011000, "fdiv", BinOpFrag<(fdiv node:$LHS, node:$RHS)>>; +defm f2FMUL : F2_XYZ_T<0b010000, "fmul", BinOpFrag<(fmul node:$LHS, node:$RHS)>>; + +defm f2FMAXNM : F2_XYZ_T<0b101000, "fmaxnm", BinOpFrag<(fmaxnum node:$LHS, node:$RHS)>>; +defm f2FMINNM : F2_XYZ_T<0b101001, "fminnm", BinOpFrag<(fminnum node:$LHS, node:$RHS)>>; + +defm f2FABS : F2_XZ_T<0b000110, "fabs", fabs>; +defm f2FNEG : F2_XZ_T<0b000111, "fneg", fneg>; +defm f2FSQRT : F2_XZ_T<0b011010, "fsqrt", fsqrt>; +defm f2FMOV : F2_XZ_SET_T<0b000100, "fmov">; +def f2FMOVX : F2_XZ_SET<0b00001, FPR32Op, 0b000101, "fmovx.32">; + +defm f2RECIP : F2_XZ_SET_T<0b011001, "frecip">; + +// fld/fst +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def f2FLD_S : F2_LDST_S<0b0, "fld", (outs FPR32Op:$vrz), (ins GPR:$rx, uimm8_2:$imm8)>; + let Predicates = [HasFPUv3_DF] in + def f2FLD_D : F2_LDST_D<0b0, "fld", (outs FPR64Op:$vrz), (ins GPR:$rx, uimm8_2:$imm8)>; +} +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def f2FST_S : F2_LDST_S<0b1, "fst", (outs), (ins FPR32Op:$vrz, GPR:$rx, uimm8_2:$imm8)>; + let Predicates = [HasFPUv3_DF] in + def f2FST_D : F2_LDST_D<0b1, "fst", (outs), (ins FPR64Op:$vrz, GPR:$rx, uimm8_2:$imm8)>; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def f2FSTM_S : F2_LDSTM_S<0b1, 0, "fstm", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>; + let Predicates = [HasFPUv3_DF] in + def f2FSTM_D : F2_LDSTM_D<0b1, 0, "fstm", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>; + + def f2FSTMU_S : F2_LDSTM_S<0b1, 0b100, "fstmu", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>; 
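+  // The 'u' forms (fstmu/fldmu) appear to be the base-update variants,
+  // writing the advanced address back to $rx after the transfer.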
+ let Predicates = [HasFPUv3_DF] in + def f2FSTMU_D : F2_LDSTM_D<0b1, 0b100, "fstmu", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>; +} + +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def f2FLDM_S : F2_LDSTM_S<0b0, 0, "fldm", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>; + let Predicates = [HasFPUv3_DF] in + def f2FLDM_D : F2_LDSTM_D<0b0, 0, "fldm", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>; + + def f2FLDMU_S : F2_LDSTM_S<0b0, 0b100, "fldmu", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>; + let Predicates = [HasFPUv3_DF] in + def f2FLDMU_D : F2_LDSTM_D<0b0, 0b100, "fldmu", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>; +} + +multiclass FLSR { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def FLDR_S : F2_LDSTR_S<0b0, "fldr", (outs FPR32Op:$rz), (ins GPR:$rx, GPR:$ry, uimm2:$imm)>; + let Predicates = [HasFPUv3_DF] in + def FLDR_D : F2_LDSTR_D<0b0, "fldr", (outs FPR64Op:$rz), (ins GPR:$rx, GPR:$ry, uimm2:$imm)>; + } + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def FSTR_S : F2_LDSTR_S<0b1, "fstr", (outs), (ins FPR32Op:$rz, GPR:$rx, GPR:$ry, uimm2:$imm)>; + let Predicates = [HasFPUv3_DF] in + def FSTR_D : F2_LDSTR_D<0b1, "fstr", (outs), (ins FPR64Op:$rz, GPR:$rx, GPR:$ry, uimm2:$imm)>; + } +} + +defm f2: FLSR; + +def f2FLRW_S : F2_LRW<0b00, 0b0, "flrw.32", (outs FPR32Op:$vrz), (ins fconstpool_symbol:$imm8)>; +def f2FLRW_D : F2_LRW<0b01, 0b0, "flrw.64", (outs FPR64Op:$vrz), (ins fconstpool_symbol:$imm8)>; + +def : Pat<(f32 (load constpool:$src)), (f2FLRW_S (to_tconstpool tconstpool:$src))>, Requires<[HasFPUv3_SF]>; +def : Pat<(f64 (load constpool:$src)), (f2FLRW_D (to_tconstpool tconstpool:$src))>, Requires<[HasFPUv3_DF]>; + +defm : LdPat<load, uimm8_2, f2FLD_S, f32>, Requires<[HasFPUv3_SF]>; +defm : LdPat<load, uimm8_2, f2FLD_D, f64>, Requires<[HasFPUv3_DF]>; +defm : LdrPat<load, f2FLDR_S, f32>, Requires<[HasFPUv3_SF]>; +defm : LdrPat<load, f2FLDR_D, f64>, Requires<[HasFPUv3_DF]>; + +defm : StPat<store, f32, uimm8_2, f2FST_S>, Requires<[HasFPUv3_SF]>; +defm : StPat<store, f64, uimm8_2, f2FST_D>, Requires<[HasFPUv3_DF]>; +defm : StrPat<store, f32, f2FSTR_S>, Requires<[HasFPUv3_SF]>; +defm : StrPat<store, f64, f2FSTR_D>, Requires<[HasFPUv3_DF]>; + +// fmfvr +let vry = 0 in +def f2FMFVRL : F2_XYZ<0b00011, 0b011001, "fmfvr.32.1\t$vrz, $vrx", + (outs GPR:$vrz), (ins FPR32Op:$vrx), + [(set GPR:$vrz, (bitconvert FPR32Op:$vrx))]>; +// TODO: vrz and vrz+1 +def f2FMFVRL_2 : F2_XYZ<0b00011, 0b111010, "fmfvr.32.2\t$vrz, $vry, $vrx", + (outs GPR:$vrz, GPR:$vry), (ins FPR64Op:$vrx), + []>; + +let Predicates = [HasFPUv3_DF] in { +let vry = 0 in { +let isCodeGenOnly = 1 in +def f2FMFVRL_D : F2_XYZ<0b00011, 0b011001, "fmfvr.32.1\t$vrz, $vrx", + (outs GPR:$vrz), (ins FPR64Op:$vrx), + []>; +def f2FMFVRH_D : F2_XYZ<0b00011, 0b011000, "fmfvrh\t$vrz, $vrx", + (outs GPR:$vrz), (ins FPR64Op:$vrx), + []>; +} +def f2FMFVR_D : F2_XYZ<0b00011, 0b111000, "fmfvr.64\t$vrz, $vry, $vrx", + (outs GPR:$vrz, GPR:$vry), (ins FPR64Op:$vrx), + [(set GPR:$vrz, GPR:$vry, (CSKY_BITCAST_TO_LOHI FPR64Op:$vrx))]>; +} + +// fmtvr +def f2FMTVRL : F2_XZ_P<0b00011, 0b011011, "fmtvr.32.1", + [(set FPR32Op:$vrz, (bitconvert GPR:$vrx))], + (outs FPR32Op:$vrz), (ins GPR:$vrx)>; +// TODO: vrz and vrz+1 +def f2FMTVRL_2 : F2_XYZ<0b00011, 0b111110, "fmtvr.32.2\t$vrz, $vrx, $vry", + (outs FPR32Op:$vrz), (ins GPR:$vrx, GPR:$vry), + []>; + +let Predicates = [HasFPUv3_DF] in { +let isCodeGenOnly = 1 in +def f2FMTVRL_D : F2_XZ_P<0b00011, 0b011011, "fmtvr.32.1", + [], + 
(outs FPR64Op:$vrz), (ins GPR:$vrx)>; +let Constraints = "$vrZ = $vrz" in +def f2FMTVRH_D : F2_XZ_P<0b00011, 0b011010, "fmtvrh", + [], + (outs FPR64Op:$vrz), (ins FPR64Op:$vrZ, GPR:$vrx)>; +def f2FMTVR_D : F2_XYZ<0b00011, 0b111100, "fmtvr.64\t$vrz, $vrx, $vry", + (outs FPR64Op:$vrz), (ins GPR:$vrx, GPR:$vry), + [(set FPR64Op:$vrz, (CSKY_BITCAST_FROM_LOHI GPR:$vrx, GPR:$vry))]>; +} + +// fcmp + +defm f2FCMPHS: F2_CXY_T<0b001100, "fcmphs">; +defm f2FCMPLT: F2_CXY_T<0b001101, "fcmplt">; +defm f2FCMPNE: F2_CXY_T<0b001110, "fcmpne">; +defm f2FCMPUO: F2_CXY_T<0b001111, "fcmpuo">; + +defm f2FCMPHSZ: F2_CX_T<0b001000, "fcmphsz">; +defm f2FCMPHZ : F2_CX_T<0b101010, "fcmphz">; +defm f2FCMPLSZ: F2_CX_T<0b101011, "fcmplsz">; +defm f2FCMPLTZ: F2_CX_T<0b001001, "fcmpltz">; +defm f2FCMPNEZ: F2_CX_T<0b001010, "fcmpnez">; +defm f2FCMPUOZ: F2_CX_T<0b001011, "fcmpuoz">; + +defm f2FMULA : F2_XYZZ_T<0b010100, "fmula", + TriOpFrag<(fadd (fmul node:$LHS, node:$MHS), node:$RHS)>>; + +defm f2FMULS : F2_XYZZ_T<0b010110, "fmuls", + TriOpFrag<(fsub node:$RHS, (fmul node:$LHS, node:$MHS))>>; + +defm f2FFMULA : F2_XYZZ_T<0b110000, "ffmula", + TriOpFrag<(fma node:$LHS, node:$MHS, node:$RHS)>>; + +defm f2FFMULS : F2_XYZZ_T<0b110001, "ffmuls", + TriOpFrag<(fma (fneg node:$LHS), node:$MHS, node:$RHS)>>; + +defm f2FFNMULA : F2_XYZZ_T<0b110010, "ffnmula", + TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))>>; + +defm f2FFNMULS : F2_XYZZ_T<0b110011, "ffnmuls", + TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))>>; + +defm f2FNMULA : F2_XYZZ_T<0b010111, "fnmula", + TriOpFrag<(fneg (fadd (fmul node:$LHS, node:$MHS), node:$RHS))>>; + +defm f2FNMULS : F2_XYZZ_T<0b010101, "fnmuls", + TriOpFrag<(fneg (fsub node:$RHS, (fmul node:$LHS, node:$MHS)))>>; + +defm f2FNMUL : F2_XYZ_T<0b010001, "fnmul", + BinOpFrag<(fneg (fmul node:$LHS, node:$RHS))>>; + +// fcvt +def f2FFTOS32_S : F2_XZ_P<0b01000, 0b011011, "fftoi.f32.s32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FFTOU32_S : F2_XZ_P<0b01000, 0b011010, "fftoi.f32.u32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FS32TOF_S : F2_XZ_P<0b01001, 0b011011, "fitof.s32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FU32TOF_S : F2_XZ_P<0b01001, 0b011010, "fitof.u32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FFTOXU32_S : F2_XZ_P<0b01000, 0b001010, "fftox.f32.u32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FFTOXS32_S : F2_XZ_P<0b01000, 0b001011, "fftox.f32.s32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FXTOFU32_S : F2_XZ_P<0b01001, 0b001010, "fxtof.u32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FXTOFS32_S : F2_XZ_P<0b01001, 0b001011, "fxtof.s32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +let Predicates = [HasFPUv3_DF] in { +def f2FFTOS32_D : F2_XZ_P<0b01000, 0b011101, "fftoi.f64.s32", [], (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>; +def f2FFTOU32_D : F2_XZ_P<0b01000, 0b011100, "fftoi.f64.u32", [], (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>; +def f2FS32TOF_D : F2_XZ_P<0b01001, 0b011101, "fitof.s32.f64", [], (outs FPR64Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FU32TOF_D : F2_XZ_P<0b01001, 0b011100, "fitof.u32.f64", [], (outs FPR64Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FFTOXU32_D : F2_XZ_P<0b01000, 0b001100, "fftox.f64.u32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FFTOXS32_D : F2_XZ_P<0b01000, 0b001101, "fftox.f64.s32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FXTOFU32_D : F2_XZ_P<0b01001, 0b001100, "fxtof.u32.f64", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +def f2FXTOFS32_D : 
F2_XZ_P<0b01001, 0b001101, "fxtof.s32.f64", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +} + +defm f2FF32TOSI32 : F2_XZ_RM<0b00011, 0b0000, "fftoi.f32.s32", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +defm f2FF32TOUI32 : F2_XZ_RM<0b00011, 0b0001, "fftoi.f32.u32", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +defm f2FF32TOFI32 : F2_XZ_RM<0b01000, 0b1001, "fftofi.f32", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +let Predicates = [HasFPUv3_DF] in { +defm f2FF64TOSI32 : F2_XZ_RM<0b00011, 0b0010, "fftoi.f64.s32", (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>; +defm f2FF64TOUI32 : F2_XZ_RM<0b00011, 0b0011, "fftoi.f64.u32", (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>; +defm f2FF64TOFI32 : F2_XZ_RM<0b01000, 0b1010, "fftofi.f64", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>; +} + +def : Pat<(i32 (fp_to_sint (fround FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RN $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_uint (fround FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RN $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_sint (fceil FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_uint (fceil FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_sint (ffloor FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_uint (ffloor FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_sint (ftrunc FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_uint (ftrunc FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_sint FPR32Op:$vrx)), (COPY_TO_REGCLASS (f2FF32TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>; +def : Pat<(i32 (fp_to_uint FPR32Op:$vrx)), (COPY_TO_REGCLASS (f2FF32TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>; + +def : Pat<(i32 (fp_to_sint (fround FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RN $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_uint (fround FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RN $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_sint (fceil FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_uint (fceil FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_sint (ffloor FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_uint (ffloor FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_sint (ftrunc FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_uint (ftrunc FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_sint FPR64Op:$vrx)), (COPY_TO_REGCLASS (f2FF64TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>; +def : Pat<(i32 (fp_to_uint FPR64Op:$vrx)), (COPY_TO_REGCLASS (f2FF64TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>; + +def : Pat<(sint_to_fp GPR:$vrx), (f2FS32TOF_S (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_SF]>; +def : Pat<(uint_to_fp GPR:$vrx), (f2FU32TOF_S (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_SF]>; +def : Pat<(sint_to_fp GPR:$vrx), (f2FS32TOF_D 
(COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_DF]>; +def : Pat<(uint_to_fp GPR:$vrx), (f2FU32TOF_D (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_DF]>; + +let Predicates = [HasFPUv3_DF] in { +def f2FDTOS : F2_XZ_P<0b00011, 0b010110, "fdtos", [(set FPR32Op:$vrz, (fpround FPR64Op:$vrx))], (outs FPR32Op:$vrz), + (ins FPR64Op:$vrx)>; +def f2FSTOD : F2_XZ_P<0b00011, 0b010111, "fstod", [(set FPR64Op:$vrz, (fpextend FPR32Op:$vrx))], (outs FPR64Op:$vrz), + (ins FPR32Op:$vrx)>; +} + +// fsel +defm f2FSEL: F2_CXYZ_T<0b111001, "fsel">; + +def f2FINS: F2_XZ_SET<0b00000, FPR32Op, 0b011011, "fins.32">; + +def : Pat<(f32 fpimm16:$imm),(COPY_TO_REGCLASS (MOVI32 (fpimm32_lo16 fpimm16:$imm)), FPR32)>, + Requires<[HasFPUv3_SF]>; +def : Pat<(f32 fpimm16_16:$imm), (COPY_TO_REGCLASS (MOVIH32 (fpimm32_hi16 fpimm16_16:$imm)), FPR32)>, + Requires<[HasFPUv3_SF]>; +def : Pat<(f32 fpimm:$imm),(COPY_TO_REGCLASS (ORI32 (MOVIH32 (fpimm32_hi16 fpimm:$imm)), (fpimm32_lo16 fpimm:$imm)), FPR32)>, + Requires<[HasFPUv3_SF]>; + + +multiclass BRCond_Bin_F2<CondCode CC, string Instr, Instruction Br, Instruction MV, bit IsSelectSwap = 0> { + let Predicates = [HasFPUv3_SF] in + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2), bb:$imm16)>; + let Predicates = [HasFPUv3_DF] in + def : Pat<(brcond (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2), bb:$imm16)>; + + let Predicates = [HasFPUv3_SF] in + def : Pat<(i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2))>; + let Predicates = [HasFPUv3_DF] in + def : Pat<(i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2))>; + + let Predicates = [HasFPUv3_SF] in { + def : Pat<(select (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), FPR32Op:$rx, FPR32Op:$false), + !if( + !eq(IsSelectSwap, 0), + (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2), FPR32Op:$false, FPR32Op:$rx) + )>; + } + let Predicates = [HasFPUv3_DF] in { + def : Pat<(select (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), FPR64Op:$rx, FPR64Op:$false), + !if( + !eq(IsSelectSwap, 0), + (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2), FPR64Op:$rx, FPR64Op:$false), + (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2), FPR64Op:$false, FPR64Op:$rx) + )>; + } +} + +multiclass BRCond_Bin_SWAP_F2<CondCode CC, string Instr, Instruction Br, Instruction MV, bit IsSelectSwap = 0> { + let Predicates = [HasFPUv3_SF] in + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1), bb:$imm16)>; + let Predicates = [HasFPUv3_DF] in + def : Pat<(brcond (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), bb:$imm16), + (Br (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1), bb:$imm16)>; + + let Predicates = [HasFPUv3_SF] in + def : Pat<(i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1))>; + let Predicates = [HasFPUv3_DF] in + def : Pat<(i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), + (MV (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1))>; + + let Predicates = [HasFPUv3_SF] in { + def : Pat<(select (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), FPR32Op:$rx, FPR32Op:$false), 
+ !if( + !eq(IsSelectSwap, 0), + (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx) + )>; + } + let Predicates = [HasFPUv3_DF] in { + def : Pat<(select (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), FPR64Op:$rx, FPR64Op:$false), + !if( + !eq(IsSelectSwap, 0), + (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1), FPR64Op:$rx, FPR64Op:$false), + (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1), FPR64Op:$false, FPR64Op:$rx) + )>; + } +} + +// inverse (order && compare) to (unorder || inverse(compare)) + +defm : BRCond_Bin_F2<SETUNE, "f2FCMPNE", BT32, MVC32>; +defm : BRCond_Bin_F2<SETOEQ, "f2FCMPNE", BF32, MVCV32, 1>; +defm : BRCond_Bin_F2<SETOGE, "f2FCMPHS", BT32, MVC32>; +defm : BRCond_Bin_F2<SETOLT, "f2FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_F2<SETUO, "f2FCMPUO", BT32, MVC32>; +defm : BRCond_Bin_F2<SETO, "f2FCMPUO", BF32, MVCV32, 1>; +defm : BRCond_Bin_SWAP_F2<SETOGT, "f2FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_SWAP_F2<SETOLE, "f2FCMPHS", BT32, MVC32>; + +defm : BRCond_Bin_F2<SETNE, "f2FCMPNE", BT32, MVC32>; +defm : BRCond_Bin_F2<SETEQ, "f2FCMPNE", BF32, MVCV32, 1>; +defm : BRCond_Bin_F2<SETGE, "f2FCMPHS", BT32, MVC32>; +defm : BRCond_Bin_F2<SETLT, "f2FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_SWAP_F2<SETGT, "f2FCMPLT", BT32, MVC32>; +defm : BRCond_Bin_SWAP_F2<SETLE, "f2FCMPHS", BT32, MVC32>; + +// ------ + +let Predicates = [HasFPUv3_SF] in { + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGE)), bb:$imm16), + (BT32 (f2FCMPHSZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOGE)), + (MVC32 (f2FCMPHSZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGE)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPHSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLT)), bb:$imm16), + (BT32 (f2FCMPLTZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOLT)), + (MVC32 (f2FCMPLTZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLT)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPLTZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLE)), bb:$imm16), + (BT32 (f2FCMPLSZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOLE)), + (MVC32 (f2FCMPLSZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLE)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPLSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGT)), bb:$imm16), + (BT32 (f2FCMPHZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOGT)), + (MVC32 (f2FCMPHZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGT)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPHZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETUNE)), bb:$imm16), + (BT32 (f2FCMPNEZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETUNE)), + (MVC32 (f2FCMPNEZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETUNE)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPNEZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm, SETUO)), bb:$imm16), + (BT32 
(f2FCMPUOZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm, SETUO)), + (MVC32 (f2FCMPUOZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm, SETUO)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPUOZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETGE)), bb:$imm16), + (BT32 (f2FCMPHSZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETGE)), + (MVC32 (f2FCMPHSZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETGE)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPHSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETLT)), bb:$imm16), + (BT32 (f2FCMPLTZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETLT)), + (MVC32 (f2FCMPLTZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETLT)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPLTZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETLE)), bb:$imm16), + (BT32 (f2FCMPLSZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETLE)), + (MVC32 (f2FCMPLSZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETLE)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPLSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETGT)), bb:$imm16), + (BT32 (f2FCMPHZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETGT)), + (MVC32 (f2FCMPHZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETGT)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPHZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>; + + + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm, SETO)), bb:$imm16), + (BF32 (f2FCMPUOZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm, SETO)), + (MVCV32 (f2FCMPUOZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm, SETO)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPUOZ_S FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOEQ)), bb:$imm16), + (BF32 (f2FCMPNEZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOEQ)), + (MVCV32 (f2FCMPNEZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOEQ)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPNEZ_S FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)>; + def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETEQ)), bb:$imm16), + (BF32 (f2FCMPNEZ_S FPR32Op:$rs1), bb:$imm16)>; + def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETEQ)), + (MVCV32 (f2FCMPNEZ_S FPR32Op:$rs1))>; + def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETEQ)), FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S (f2FCMPNEZ_S FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)>; +} + + +let Predicates = [HasFPUv3_SF] in +def : Pat<(select CARRY:$ca, FPR32Op:$rx, FPR32Op:$false), + (f2FSEL_S CARRY:$ca, FPR32Op:$rx, FPR32Op:$false)>; +let Predicates = [HasFPUv3_DF] in +def : Pat<(select CARRY:$ca, FPR64Op:$rx, FPR64Op:$false), + (f2FSEL_D CARRY:$ca, FPR64Op:$rx, FPR64Op:$false)>;
\ No newline at end of file diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td index ade5c7f795af..b7f4fc17166b 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td @@ -194,6 +194,8 @@ def FPR64 : RegisterClass<"CSKY", [f64], 64, def sFPR64 : RegisterClass<"CSKY", [f64], 64, (add (sequence "F%u_64", 0, 15))>; +def sFPR64_V : RegisterClass<"CSKY", [v2f32], 32, (add sFPR64)>; + def FPR128 : RegisterClass<"CSKY", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (add (sequence "F%u_128", 0, 31))>; diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp index 8f61feb6506d..94b24044c27d 100644 --- a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp +++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp @@ -23,6 +23,9 @@ using namespace llvm; extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTarget() { RegisterTargetMachine<CSKYTargetMachine> X(getTheCSKYTarget()); + + PassRegistry *Registry = PassRegistry::getPassRegistry(); + initializeCSKYConstantIslandsPass(*Registry); } static std::string computeDataLayout(const Triple &TT) { @@ -92,6 +95,7 @@ public: } bool addInstSelector() override; + void addPreEmitPass() override; }; } // namespace @@ -105,3 +109,7 @@ bool CSKYPassConfig::addInstSelector() { return false; } + +void CSKYPassConfig::addPreEmitPass() { + addPass(createCSKYConstantIslandPass()); +} diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp index 7001de999a51..07757f03c258 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp @@ -73,6 +73,13 @@ void CSKYInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { O << getRegisterName(RegNo); } +void CSKYInstPrinter::printFPRRegName(raw_ostream &O, unsigned RegNo) const { + if (PrintBranchImmAsAddress) + O << getRegisterName(RegNo, CSKY::NoRegAltName); + else + O << getRegisterName(RegNo); +} + void CSKYInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, const char *Modifier) { @@ -201,3 +208,11 @@ const char *CSKYInstPrinter::getRegisterName(unsigned RegNo) { return getRegisterName(RegNo, ArchRegNames ? CSKY::NoRegAltName : CSKY::ABIRegAltName); } + +void CSKYInstPrinter::printFPR(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNo); + assert(MO.isReg()); + + printFPRRegName(O, MO.getReg()); +} diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h index f93a342ec6a3..52a1b9276762 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h @@ -36,6 +36,8 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, const char *Modifier = nullptr); + void printFPRRegName(raw_ostream &O, unsigned RegNo) const; + // Autogenerated by tblgen. 
std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; void printInstruction(const MCInst *MI, uint64_t Address, @@ -60,6 +62,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printSPAddr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printFPR(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); static const char *getRegisterName(unsigned RegNo); static const char *getRegisterName(unsigned RegNo, unsigned AltIdx); }; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp index 668247bbbd87..543f2e3d43d4 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp @@ -22,4 +22,6 @@ CSKYMCAsmInfo::CSKYMCAsmInfo(const Triple &TargetTriple) { AlignmentIsInBytes = false; SupportsDebugInformation = true; CommentString = "#"; + + ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index d131cf896834..15eba89eeb55 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -211,8 +211,7 @@ struct HexagonOperand : public MCParsedAsmOperand { struct ImmTy Imm; }; - HexagonOperand(KindTy K, MCContext &Context) - : MCParsedAsmOperand(), Kind(K), Context(Context) {} + HexagonOperand(KindTy K, MCContext &Context) : Kind(K), Context(Context) {} public: HexagonOperand(const HexagonOperand &o) diff --git a/llvm/lib/Target/Hexagon/BitTracker.cpp b/llvm/lib/Target/Hexagon/BitTracker.cpp index 685bafd785df..17adf32750db 100644 --- a/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -940,8 +940,8 @@ void BT::visitBranchesFrom(const MachineInstr &BI) { // If evaluated successfully add the targets to the cumulative list. 
if (Trace) { dbgs() << " adding targets:"; - for (unsigned i = 0, n = BTs.size(); i < n; ++i) - dbgs() << " " << printMBBReference(*BTs[i]); + for (const MachineBasicBlock *BT : BTs) + dbgs() << " " << printMBBReference(*BT); if (FallsThrough) dbgs() << "\n falls through\n"; else diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 428d25da6dbc..b2a842233bb8 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -3260,13 +3260,12 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { dbgs() << "Group[" << i << "] inp: " << printReg(G.Inp.Reg, HRI, G.Inp.Sub) << " out: " << printReg(G.Out.Reg, HRI, G.Out.Sub) << "\n"; - for (unsigned j = 0, m = G.Ins.size(); j < m; ++j) - dbgs() << " " << *G.Ins[j]; + for (const MachineInstr *MI : G.Ins) + dbgs() << " " << MI; } }); - for (unsigned i = 0, n = Groups.size(); i < n; ++i) { - InstrGroup &G = Groups[i]; + for (InstrGroup &G : Groups) { if (!isShuffleOf(G.Out.Reg, G.Inp.Reg)) continue; auto LoopInpEq = [G] (const PhiInfo &P) -> bool { diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index 1938a5c259da..8e014b395286 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -493,6 +493,11 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI, RegisterCell RC = eADD(rc(1), lo(M, W0)); return rr0(RC, Outputs); } + case M2_mnaci: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eSUB(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } case M2_mpysmi: { RegisterCell M = eMLS(rc(1), eIMM(im(2), W0)); return rr0(lo(M, 32), Outputs); diff --git a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index b456cf139c55..a31ad45f4bb0 100644 --- a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -118,13 +118,10 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { return false; // Loop over all of the basic blocks. - for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); - MBBb != MBBe; ++MBBb) { - MachineBasicBlock *MBB = &*MBBb; - + for (MachineBasicBlock &MBB : Fn) { // Traverse the basic block. - MachineBasicBlock::iterator MII = MBB->getFirstTerminator(); - if (MII != MBB->end()) { + MachineBasicBlock::iterator MII = MBB.getFirstTerminator(); + if (MII != MBB.end()) { MachineInstr &MI = *MII; int Opc = MI.getOpcode(); if (IsConditionalBranch(Opc)) { @@ -155,17 +152,17 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // Remove BB2 // BB3: ... // BB4: ... 
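The BB1..BB4 comment above is the whole trick: when a conditional branch jumps around a block whose only job is an unconditional jump, inverting the condition and retargeting the branch makes the jump-around block dead. A standalone sketch of that rewrite, not part of the patch (plain C++ over a toy block record rather than MachineIR; every name below is invented for the illustration):

    #include <cassert>
    #include <cstdio>

    // Toy stand-in for a block ending in "if (Cond) goto CondTarget",
    // falling through to the next block in layout order.
    struct ToyBlock {
      bool CondInverted; // branch taken on !Cond instead of Cond
      int CondTarget;    // block number the conditional branch jumps to
    };

    // Invert the branch and aim it at the unconditional jump's target, so
    // the block holding that unconditional jump can be removed.
    static void invertAndChangeJumpTarget(ToyBlock &B, int UncondTarget) {
      B.CondInverted = !B.CondInverted;
      B.CondTarget = UncondTarget;
    }

    int main() {
      // BB1: if (Cond) goto BB3; fall through to BB2; BB2: goto BB4.
      ToyBlock BB1{false, /*CondTarget=*/3};
      invertAndChangeJumpTarget(BB1, /*UncondTarget=*/4);
      // Now BB1 branches to BB4 on !Cond and falls through toward BB3.
      assert(BB1.CondInverted && BB1.CondTarget == 4);
      std::printf("BB1 now jumps to BB%d on the inverted condition\n",
                  BB1.CondTarget);
      return 0;
    }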
- unsigned NumSuccs = MBB->succ_size(); - MachineBasicBlock::succ_iterator SI = MBB->succ_begin(); + unsigned NumSuccs = MBB.succ_size(); + MachineBasicBlock::succ_iterator SI = MBB.succ_begin(); MachineBasicBlock* FirstSucc = *SI; MachineBasicBlock* SecondSucc = *(++SI); MachineBasicBlock* LayoutSucc = nullptr; MachineBasicBlock* JumpAroundTarget = nullptr; - if (MBB->isLayoutSuccessor(FirstSucc)) { + if (MBB.isLayoutSuccessor(FirstSucc)) { LayoutSucc = FirstSucc; JumpAroundTarget = SecondSucc; - } else if (MBB->isLayoutSuccessor(SecondSucc)) { + } else if (MBB.isLayoutSuccessor(SecondSucc)) { LayoutSucc = SecondSucc; JumpAroundTarget = FirstSucc; } else { @@ -201,7 +198,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); + MBB.replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td index 93e17e608dd1..cc41b569e490 100644 --- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td +++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td @@ -126,16 +126,16 @@ def CC_Hexagon_HVX: CallingConv<[ // HVX 128-byte mode CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>, CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], CCAssignToStack<128,128>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], CCAssignToStack<256,128>>>, CCDelegateTo<CC_Hexagon> @@ -152,10 +152,10 @@ def RetCC_Hexagon_HVX: CallingConv<[ // HVX 128-byte mode CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], CCAssignToReg<[V0]>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], CCAssignToReg<[W0]>>>, CCDelegateTo<RetCC_Hexagon> diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index a53efeb96961..fc5e05d8c9a0 100644 --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -290,13 +290,11 @@ namespace { raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED; raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){ - using const_iterator = NodeToUsesMap::const_iterator; - - for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) { - const UseSet &Us = I->second; - OS << I->first << " -> #" << Us.size() << '{'; - for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) { - User *R = (*J)->getUser(); + for (const auto &I : M) { + const UseSet &Us = I.second; + OS << I.first << " -> #" << Us.size() << '{'; + for (const Use *U : Us) { + User *R = U->getUser(); if (R->hasName()) OS << ' ' << R->getName(); else @@ -420,15 +418,12 @@ void HexagonCommonGEP::collect() { // instruction that uses another GEP instruction as the base pointer, the // gep node for the base pointer should already exist. 
ValueToNodeMap NM; - for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) { - BasicBlock *B = cast<BasicBlock>(*I); - for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) { - if (!isa<GetElementPtrInst>(J)) - continue; - GetElementPtrInst *GepI = cast<GetElementPtrInst>(J); - if (isHandledGepForm(GepI)) - processGepInst(GepI, NM); - } + for (Value *I : BO) { + BasicBlock *B = cast<BasicBlock>(I); + for (Instruction &J : *B) + if (auto *GepI = dyn_cast<GetElementPtrInst>(&J)) + if (isHandledGepForm(GepI)) + processGepInst(GepI, NM); } LLVM_DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes); @@ -436,17 +431,14 @@ void HexagonCommonGEP::collect() { static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM, NodeVect &Roots) { - using const_iterator = NodeVect::const_iterator; - - for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; - if (N->Flags & GepNode::Root) { - Roots.push_back(N); - continue; - } - GepNode *PN = N->Parent; - NCM[PN].push_back(N); + for (GepNode *N : Nodes) { + if (N->Flags & GepNode::Root) { + Roots.push_back(N); + continue; } + GepNode *PN = N->Parent; + NCM[PN].push_back(N); + } } static void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM, @@ -546,8 +538,7 @@ void HexagonCommonGEP::common() { using NodeSetMap = std::map<unsigned, NodeSet>; NodeSetMap MaybeEq; - for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Nodes) { unsigned H = node_hash(N); MaybeEq[H].insert(N); } @@ -556,9 +547,8 @@ void HexagonCommonGEP::common() { // one for equality and the other for non-equality. NodeSymRel EqRel; // Equality relation (as set of equivalence classes). NodePairSet Eq, Ne; // Caches. - for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end(); - I != E; ++I) { - NodeSet &S = I->second; + for (auto &I : MaybeEq) { + NodeSet &S = I.second; for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) { GepNode *N = *NI; // If node already has a class, then the class must have been created @@ -612,8 +602,7 @@ void HexagonCommonGEP::common() { // Update the min element's flags, and user list. uint32_t Flags = 0; UseSet &MinUs = Uses[Min]; - for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) { - GepNode *N = *J; + for (GepNode *N : S) { uint32_t NF = N->Flags; // If N is used, append all original values of N to the list of // original values of Min. @@ -633,8 +622,7 @@ void HexagonCommonGEP::common() { // selected (minimum) node from the corresponding equivalence class. // If a given parent does not have an equivalence class, leave it // unchanged (it means that it's the only element in its class). - for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Nodes) { if (N->Flags & GepNode::Root) continue; const NodeSet *PC = node_class(N->Parent, EqRel); @@ -652,8 +640,7 @@ void HexagonCommonGEP::common() { // Finally, erase the nodes that are no longer used. NodeSet Erase; - for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Nodes) { const NodeSet *PC = node_class(N, EqRel); if (!PC) continue; @@ -663,7 +650,7 @@ void HexagonCommonGEP::common() { if (N == F->second) continue; // Node for removal. 
- Erase.insert(*I); + Erase.insert(N); } erase_if(Nodes, in_set(Erase)); @@ -775,8 +762,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node, NodeToUsesMap::iterator UF = Uses.find(Node); assert(UF != Uses.end() && "Used node with no use information"); UseSet &Us = UF->second; - for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { - Use *U = *I; + for (Use *U : Us) { User *R = U->getUser(); if (!isa<Instruction>(R)) continue; @@ -790,8 +776,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node, NodeChildrenMap::iterator CF = NCM.find(Node); if (CF != NCM.end()) { NodeVect &Cs = CF->second; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { - GepNode *CN = *I; + for (GepNode *CN : Cs) { NodeToValueMap::iterator LF = Loc.find(CN); // If the child is only used in GEP instructions (i.e. is not used in // non-GEP instructions), the nearest dominator computed for it may @@ -831,8 +816,8 @@ BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node, NodeChildrenMap::iterator CF = NCM.find(Node); if (CF != NCM.end()) { NodeVect &Cs = CF->second; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) - recalculatePlacementRec(*I, NCM, Loc); + for (GepNode *C : Cs) + recalculatePlacementRec(C, NCM, Loc); } BasicBlock *LB = recalculatePlacement(Node, NCM, Loc); LLVM_DEBUG(dbgs() << "LocRec end for node:" << Node << '\n'); @@ -921,8 +906,8 @@ BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node, NodeChildrenMap::iterator CF = NCM.find(Node); if (CF != NCM.end()) { NodeVect &Cs = CF->second; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) - adjustForInvariance(*I, NCM, Loc); + for (GepNode *C : Cs) + adjustForInvariance(C, NCM, Loc); } return LocB; } @@ -938,10 +923,9 @@ namespace { raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ; raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) { - for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end(); - I != E; ++I) { - OS << I->first << " -> "; - if (BasicBlock *B = cast_or_null<BasicBlock>(I->second)) + for (const auto &I : Loc.Map) { + OS << I.first << " -> "; + if (BasicBlock *B = cast_or_null<BasicBlock>(I.second)) OS << B->getName() << '(' << B << ')'; else OS << "<null-block>"; @@ -1016,8 +1000,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node, // Collect all used nodes together with the uses from loads and stores, // where the GEP node could be folded into the load/store instruction. NodeToUsesMap FNs; // Foldable nodes. - for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) { - GepNode *N = *I; + for (GepNode *N : Ns) { if (!(N->Flags & GepNode::Used)) continue; NodeToUsesMap::iterator UF = Uses.find(N); @@ -1025,8 +1008,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node, UseSet &Us = UF->second; // Loads/stores that use the node N. UseSet LSs; - for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) { - Use *U = *J; + for (Use *U : Us) { User *R = U->getUser(); // We're interested in uses that provide the address. 
It can happen // that the value may also be provided via GEP, but we won't handle @@ -1051,11 +1033,11 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node, LLVM_DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs); - for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) { - GepNode *N = I->first; - UseSet &Us = I->second; - for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) - separateChainForNode(N, *J, Loc); + for (auto &FN : FNs) { + GepNode *N = FN.first; + UseSet &Us = FN.second; + for (Use *U : Us) + separateChainForNode(N, U, Loc); } } @@ -1068,21 +1050,21 @@ void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) { // Compute the initial placement determined by the users' locations, and // the locations of the child nodes. - for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) - recalculatePlacementRec(*I, NCM, Loc); + for (GepNode *Root : Roots) + recalculatePlacementRec(Root, NCM, Loc); LLVM_DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc)); if (OptEnableInv) { - for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) - adjustForInvariance(*I, NCM, Loc); + for (GepNode *Root : Roots) + adjustForInvariance(Root, NCM, Loc); LLVM_DEBUG(dbgs() << "Node placement after adjustment for invariance:\n" << LocationAsBlock(Loc)); } if (OptEnableConst) { - for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) - separateConstantChains(*I, NCM, Loc); + for (GepNode *Root : Roots) + separateConstantChains(Root, NCM, Loc); } LLVM_DEBUG(dbgs() << "Node use information:\n" << Uses); @@ -1153,8 +1135,8 @@ void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values, NodeToUsesMap::iterator UF = Uses.find(N); assert(UF != Uses.end() && "No use information for used node"); UseSet &Us = UF->second; - for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) - Values.push_back((*I)->getUser()); + for (const auto &U : Us) + Values.push_back(U->getUser()); } NodeChildrenMap::iterator CF = NCM.find(N); if (CF != NCM.end()) { @@ -1223,8 +1205,7 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { // to the Roots list. 
if (LastCN > 0) { NodeVect &Cs = NCM[Last]; - for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { - GepNode *CN = *I; + for (GepNode *CN : Cs) { CN->Flags &= ~GepNode::Internal; CN->Flags |= GepNode::Root; CN->BaseVal = NewInst; @@ -1238,10 +1219,8 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { NodeToUsesMap::iterator UF = Uses.find(Last); assert(UF != Uses.end() && "No use information found"); UseSet &Us = UF->second; - for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { - Use *U = *I; + for (Use *U : Us) U->set(NewInst); - } } } } @@ -1261,8 +1240,8 @@ void HexagonCommonGEP::removeDeadCode() { ValueVect Ins; for (Instruction &I : llvm::reverse(*B)) Ins.push_back(&I); - for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) { - Instruction *In = cast<Instruction>(*I); + for (Value *I : Ins) { + Instruction *In = cast<Instruction>(I); if (isInstructionTriviallyDead(In)) In->eraseFromParent(); } diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index d3fcdb6ae9a8..d8af35cbf3a8 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -229,7 +229,7 @@ namespace { private: struct Register { Register() = default; - Register(unsigned R, unsigned S) : Reg(R), Sub(S) {} + Register(llvm::Register R, unsigned S) : Reg(R), Sub(S) {} Register(const MachineOperand &Op) : Reg(Op.getReg()), Sub(Op.getSubReg()) {} Register &operator=(const MachineOperand &Op) { @@ -1573,7 +1573,7 @@ HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) { // No compounds are available. It is not clear whether we should // even process such extenders where the initializer cannot be // a single instruction, but do it for now. - unsigned TmpR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + llvm::Register TmpR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(MBB, At, dl, HII->get(Hexagon::S2_asl_i_r), TmpR) .add(MachineOperand(Ex.Rs)) .addImm(Ex.S); diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp index daf311fc49d4..105bf2811a20 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -125,8 +125,8 @@ namespace { }; LatticeCell() : Kind(Top), Size(0), IsSpecial(false) { - for (unsigned i = 0; i < MaxCellSize; ++i) - Values[i] = nullptr; + for (const Constant *&Value : Values) + Value = nullptr; } bool meet(const LatticeCell &L); @@ -1029,8 +1029,8 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) { ToRemove.push_back(const_cast<MachineBasicBlock*>(SB)); Targets.remove(SB); } - for (unsigned i = 0, n = ToRemove.size(); i < n; ++i) - removeCFGEdge(B, ToRemove[i]); + for (MachineBasicBlock *MBB : ToRemove) + removeCFGEdge(B, MBB); // If there are any blocks left in the computed targets, it means that // we think that the block could go somewhere, but the CFG does not. 
// This could legitimately happen in blocks that have non-returning diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 03b0f75b2dc1..2ee7f1325df9 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -70,9 +70,7 @@ class HexagonCopyToCombine : public MachineFunctionPass { public: static char ID; - HexagonCopyToCombine() : MachineFunctionPass(ID) { - initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry()); - } + HexagonCopyToCombine() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 9a3feb5b6af1..2207925ceeba 100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -612,8 +612,8 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B, // Simply keep a list of children of B, and traverse that list. using DTNodeVectType = SmallVector<MachineDomTreeNode *, 4>; DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); - for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) { - MachineBasicBlock *SB = (*I)->getBlock(); + for (auto &I : Cn) { + MachineBasicBlock *SB = I->getBlock(); if (!Deleted.count(SB)) Changed |= visitBlock(SB, L); } @@ -648,8 +648,8 @@ bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) { << "\n"); bool Changed = false; if (L) { - for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) - Changed |= visitLoop(*I); + for (MachineLoop *I : *L) + Changed |= visitLoop(I); } MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN); @@ -964,8 +964,8 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) { using DTNodeVectType = SmallVector<MachineDomTreeNode *, 4>; DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); - for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) { - MachineBasicBlock *SB = (*I)->getBlock(); + for (auto &I : Cn) { + MachineBasicBlock *SB = I->getBlock(); MDT->changeImmediateDominator(SB, IDB); } } @@ -973,8 +973,8 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) { while (!B->succ_empty()) B->removeSuccessor(B->succ_begin()); - for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I) - (*I)->removeSuccessor(B, true); + for (MachineBasicBlock *Pred : B->predecessors()) + Pred->removeSuccessor(B, true); Deleted.insert(B); MDT->eraseNode(B); @@ -1064,8 +1064,8 @@ bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) { Deleted.clear(); bool Changed = false; - for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) - Changed |= visitLoop(*I); + for (MachineLoop *L : *MLI) + Changed |= visitLoop(L); Changed |= visitLoop(nullptr); return Changed; diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp index c444cf557c21..2693940bb1e9 100644 --- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -1106,8 +1106,7 @@ bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) { } bool HexagonExpandCondsets::isIntraBlocks(LiveInterval &LI) { - for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) { - LiveRange::Segment &LR = *I; + for (LiveRange::Segment &LR : LI) 
{ // Range must start at a register... if (!LR.start.isRegister()) return false; @@ -1160,16 +1159,16 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) { // Move all live segments from L2 to L1. using ValueInfoMap = DenseMap<VNInfo *, VNInfo *>; ValueInfoMap VM; - for (LiveInterval::iterator I = L2.begin(), E = L2.end(); I != E; ++I) { - VNInfo *NewVN, *OldVN = I->valno; + for (LiveRange::Segment &I : L2) { + VNInfo *NewVN, *OldVN = I.valno; ValueInfoMap::iterator F = VM.find(OldVN); if (F == VM.end()) { - NewVN = L1.getNextValue(I->valno->def, LIS->getVNInfoAllocator()); + NewVN = L1.getNextValue(I.valno->def, LIS->getVNInfoAllocator()); VM.insert(std::make_pair(OldVN, NewVN)); } else { NewVN = F->second; } - L1.addSegment(LiveRange::Segment(I->start, I->end, NewVN)); + L1.addSegment(LiveRange::Segment(I.start, I.end, NewVN)); } while (!L2.empty()) L2.removeSegment(*L2.begin()); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 12ceac545e9d..989a98571434 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -416,8 +416,8 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF, UnsignedMap RPO; RPOTType RPOT(&MF); unsigned RPON = 0; - for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) - RPO[(*I)->getNumber()] = RPON++; + for (auto &I : RPOT) + RPO[I->getNumber()] = RPON++; // Don't process functions that have loops, at least for now. Placement // of prolog and epilog must take loop structure into account. For simpli- @@ -1410,7 +1410,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, } for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); // Add live in registers. We treat eh_return callee saved register r0 - r3 // specially. They are not really callee saved registers as they are not // supposed to be killed. @@ -1479,7 +1479,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, } for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = I.getFrameIdx(); HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); @@ -1620,7 +1620,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, // sub-registers to SRegs. LLVM_DEBUG(dbgs() << "Initial CS registers: {"); for (const CalleeSavedInfo &I : CSI) { - unsigned R = I.getReg(); + Register R = I.getReg(); LLVM_DEBUG(dbgs() << ' ' << printReg(R, TRI)); for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR) SRegs[*SR] = true; @@ -2635,7 +2635,7 @@ bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, // a contiguous block starting from D8. 
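findShrunkPrologEpilog above starts by numbering blocks in reverse post-order, the usual way to get a visit order in which, back edges aside, a block is seen before its successors. A minimal standalone version of that numbering, not part of the patch (the real code takes the order from ReversePostOrderTraversal over a MachineFunction; the toy adjacency-list graph below is invented):

    #include <cstdio>
    #include <functional>
    #include <map>
    #include <vector>

    // Toy CFG: index = block number, value = successors; block 0 is entry.
    using Graph = std::vector<std::vector<int>>;

    // DFS post-order, then number in reverse: back edges aside, every
    // block receives a smaller number than all of its successors.
    static std::map<int, unsigned> rpoNumbers(const Graph &G) {
      std::vector<bool> Seen(G.size(), false);
      std::vector<int> Post;
      std::function<void(int)> DFS = [&](int N) {
        Seen[N] = true;
        for (int S : G[N])
          if (!Seen[S])
            DFS(S);
        Post.push_back(N);
      };
      DFS(0);
      std::map<int, unsigned> RPO;
      unsigned RPON = 0;
      for (auto I = Post.rbegin(), E = Post.rend(); I != E; ++I)
        RPO[*I] = RPON++;
      return RPO;
    }

    int main() {
      Graph G{{1, 2}, {3}, {3}, {}}; // diamond: 0 -> {1,2} -> 3
      for (const auto &P : rpoNumbers(G))
        std::printf("block %d -> RPO %u\n", P.first, P.second);
      return 0;
    }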
BitVector Regs(Hexagon::NUM_TARGET_REGS); for (const CalleeSavedInfo &I : CSI) { - unsigned R = I.getReg(); + Register R = I.getReg(); if (!Hexagon::DoubleRegsRegClass.contains(R)) return true; Regs[R] = true; diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 85230cac9d7c..0bb1658e7698 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -583,14 +583,12 @@ namespace { char HexagonGenInsert::ID = 0; void HexagonGenInsert::dump_map() const { - using iterator = IFMapType::const_iterator; - - for (iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - dbgs() << " " << printReg(I->first, HRI) << ":\n"; - const IFListType &LL = I->second; - for (unsigned i = 0, n = LL.size(); i < n; ++i) - dbgs() << " " << PrintIFR(LL[i].first, HRI) << ", " - << PrintRegSet(LL[i].second, HRI) << '\n'; + for (const auto &I : IFMap) { + dbgs() << " " << printReg(I.first, HRI) << ":\n"; + const IFListType &LL = I.second; + for (const auto &J : LL) + dbgs() << " " << PrintIFR(J.first, HRI) << ", " + << PrintRegSet(J.second, HRI) << '\n'; } } @@ -627,8 +625,8 @@ void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB, using SortableVectorType = std::vector<unsigned>; SortableVectorType VRs; - for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I) - VRs.push_back(I->first); + for (auto &I : RB) + VRs.push_back(I.first); llvm::sort(VRs, LexCmp); // Transfer the results to the outgoing register ordering. for (unsigned i = 0, n = VRs.size(); i < n; ++i) @@ -853,20 +851,18 @@ bool HexagonGenInsert::findRecordInsertForms(unsigned VR, if (isDebug()) { dbgs() << "Prefixes matching register " << printReg(VR, HRI) << "\n"; - for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) { - dbgs() << " L=" << I->first << ':'; - const RSListType &LL = I->second; - for (unsigned i = 0, n = LL.size(); i < n; ++i) - dbgs() << " (" << printReg(LL[i].first, HRI) << ",@" - << LL[i].second << ')'; + for (const auto &I : LM) { + dbgs() << " L=" << I.first << ':'; + const RSListType &LL = I.second; + for (const auto &J : LL) + dbgs() << " (" << printReg(J.first, HRI) << ",@" << J.second << ')'; dbgs() << '\n'; } } bool Recorded = false; - for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) { - unsigned SrcR = *I; + for (unsigned SrcR : AVs) { int FDi = -1, LDi = -1; // First/last different bit. const BitTracker::RegisterCell &AC = CMS->lookup(SrcR); uint16_t AW = AC.width(); @@ -888,8 +884,8 @@ bool HexagonGenInsert::findRecordInsertForms(unsigned VR, if (F == LM.end()) continue; RSListType &LL = F->second; - for (unsigned i = 0, n = LL.size(); i < n; ++i) { - uint16_t S = LL[i].second; + for (const auto &I : LL) { + uint16_t S = I.second; // MinL is the minimum length of the prefix. Any length above MinL // allows some flexibility as to where the prefix can start: // given the extra length EL=L-MinL, the prefix must start between @@ -900,7 +896,7 @@ bool HexagonGenInsert::findRecordInsertForms(unsigned VR, uint16_t LowS = (EL < FD) ? FD-EL : 0; if (S < LowS) // Starts too early. 
continue; - unsigned InsR = LL[i].first; + unsigned InsR = I.first; if (!isValidInsertForm(VR, SrcR, InsR, L, S)) continue; if (isDebug()) { @@ -1029,10 +1025,10 @@ void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF, } void HexagonGenInsert::computeRemovableRegisters() { - for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - IFListType &LL = I->second; - for (unsigned i = 0, n = LL.size(); i < n; ++i) - findRemovableRegisters(I->first, LL[i].first, LL[i].second); + for (auto &I : IFMap) { + IFListType &LL = I.second; + for (auto &J : LL) + findRemovableRegisters(I.first, J.first, J.second); } } @@ -1064,8 +1060,8 @@ void HexagonGenInsert::pruneCoveredSets(unsigned VR) { MachineInstr *DefVR = MRI->getVRegDef(VR); bool DefEx = HII->isConstExtended(*DefVR); bool HasNE = false; - for (unsigned i = 0, n = LL.size(); i < n; ++i) { - if (LL[i].second.empty()) + for (const auto &I : LL) { + if (I.second.empty()) continue; HasNE = true; break; @@ -1172,8 +1168,8 @@ void HexagonGenInsert::pruneCandidates() { // selection method. // First, remove candidates whose potentially removable set is a subset // of another candidate's set. - for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) - pruneCoveredSets(I->first); + for (const auto &I : IFMap) + pruneCoveredSets(I.first); UnsignedMap RPO; @@ -1181,18 +1177,18 @@ void HexagonGenInsert::pruneCandidates() { RPOTType RPOT(MFN); unsigned RPON = 0; - for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) - RPO[(*I)->getNumber()] = RPON++; + for (const auto &I : RPOT) + RPO[I->getNumber()] = RPON++; PairMapType Memo; // Memoization map for distance calculation. // Remove candidates that would use registers defined too far away. - for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) - pruneUsesTooFar(I->first, RPO, Memo); + for (const auto &I : IFMap) + pruneUsesTooFar(I.first, RPO, Memo); pruneEmptyLists(); - for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) - pruneRegCopies(I->first); + for (const auto &I : IFMap) + pruneRegCopies(I.first); } namespace { @@ -1277,8 +1273,8 @@ void HexagonGenInsert::selectCandidates() { for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { const IFListType &LL = I->second; RegisterSet TT; - for (unsigned i = 0, n = LL.size(); i < n; ++i) - TT.insert(LL[i].second); + for (const auto &J : LL) + TT.insert(J.second); for (unsigned R = TT.find_first(); R; R = TT.find_next(R)) RemC[R]++; AllRMs.insert(TT); @@ -1384,8 +1380,8 @@ bool HexagonGenInsert::generateInserts() { // Create a new register for each one from IFMap, and store them in the // map. UnsignedMap RegMap; - for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - unsigned VR = I->first; + for (auto &I : IFMap) { + unsigned VR = I.first; const TargetRegisterClass *RC = MRI->getRegClass(VR); Register NewVR = MRI->createVirtualRegister(RC); RegMap[VR] = NewVR; @@ -1394,15 +1390,15 @@ bool HexagonGenInsert::generateInserts() { // We can generate the "insert" instructions using potentially stale re- // gisters: SrcR and InsR for a given VR may be among other registers that // are also replaced. This is fine, we will do the mass "rauw" a bit later. 
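The comment above ("potentially stale registers ... the mass 'rauw' a bit later") names a deliberate two-phase shape: decide every replacement first, then rewrite all uses in one sweep, so the rewriting step never observes a half-applied state. The same shape reduced to a standalone toy over strings, not part of the patch (nothing here is LLVM API; v1/v2 and their replacements are made up):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Phase 1: decide every replacement up front. Expressions may still
      // refer to old names; nothing has been rewritten yet.
      std::vector<std::string> Uses = {"v1", "v2", "v1 + v2"};
      const std::map<std::string, std::string> RegMap = {{"v1", "n1"},
                                                         {"v2", "n2"}};

      // Phase 2: the mass replace-all-uses-with, done in one sweep at the
      // end so no use is ever seen in a half-rewritten state.
      for (std::string &U : Uses)
        for (const auto &P : RegMap) {
          size_t Pos;
          while ((Pos = U.find(P.first)) != std::string::npos)
            U.replace(Pos, P.first.size(), P.second);
        }

      for (const std::string &U : Uses)
        std::printf("%s\n", U.c_str()); // prints n1, n2, n1 + n2
      return 0;
    }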
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - MachineInstr *MI = MRI->getVRegDef(I->first); + for (auto &I : IFMap) { + MachineInstr *MI = MRI->getVRegDef(I.first); MachineBasicBlock &B = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned NewR = RegMap[I->first]; + unsigned NewR = RegMap[I.first]; bool R32 = MRI->getRegClass(NewR) == &Hexagon::IntRegsRegClass; const MCInstrDesc &D = R32 ? HII->get(Hexagon::S2_insert) : HII->get(Hexagon::S2_insertp); - IFRecord IF = I->second[0].first; + IFRecord IF = I.second[0].first; unsigned Wdh = IF.Wdh, Off = IF.Off; unsigned InsS = 0; if (R32 && MRI->getRegClass(IF.InsR) == &Hexagon::DoubleRegsRegClass) { @@ -1428,9 +1424,9 @@ bool HexagonGenInsert::generateInserts() { MRI->clearKillFlags(IF.InsR); } - for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { - MachineInstr *DefI = MRI->getVRegDef(I->first); - MRI->replaceRegWith(I->first, RegMap[I->first]); + for (const auto &I : IFMap) { + MachineInstr *DefI = MRI->getVRegDef(I.first); + MRI->replaceRegWith(I.first, RegMap[I.first]); DefI->eraseFromParent(); } @@ -1523,9 +1519,8 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { if (isDebug()) { dbgs() << "Cell ordering:\n"; - for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end(); - I != E; ++I) { - unsigned VR = I->first, Pos = I->second; + for (const auto &I : CellOrd) { + unsigned VR = I.first, Pos = I.second; dbgs() << printReg(VR, HRI) << " -> " << Pos << "\n"; } } diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 1a66394e9757..00615f355146 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -505,8 +505,8 @@ bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; collectPredicateGPR(MF); - for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I) - processPredicateGPR(*I); + for (const RegisterSubReg &R : PredGPRs) + processPredicateGPR(R); bool Again; do { diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 5d2e1b259449..43afae441457 100644 --- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -1127,8 +1127,8 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, bool L1Used = false; // Process nested loops first. - for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) { - Changed |= convertToHardwareLoop(*I, RecL0used, RecL1used); + for (MachineLoop *I : *L) { + Changed |= convertToHardwareLoop(I, RecL0used, RecL1used); L0Used |= RecL0used; L1Used |= RecL1used; } @@ -1587,16 +1587,6 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) { MO.setReg(NewR); } -static bool isImmValidForOpcode(unsigned CmpOpc, int64_t Imm) { - // These two instructions are not extendable. - if (CmpOpc == Hexagon::A4_cmpbeqi) - return isUInt<8>(Imm); - if (CmpOpc == Hexagon::A4_cmpbgti) - return isInt<8>(Imm); - // The rest of the comparison-with-immediate instructions are extendable. 
- return true; -} - bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { MachineBasicBlock *Header = L->getHeader(); MachineBasicBlock *Latch = L->getLoopLatch(); @@ -1812,9 +1802,9 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { // Most comparisons of register against an immediate value allow // the immediate to be constant-extended. There are some exceptions // though. Make sure the new combination will work. - if (CmpImmOp->isImm()) - if (!isImmValidForOpcode(PredDef->getOpcode(), CmpImm)) - return false; + if (CmpImmOp->isImm() && !TII->isExtendable(*PredDef) && + !TII->isValidOffset(PredDef->getOpcode(), CmpImm, TRI, false)) + return false; // Make sure that the compare happens after the bump. Otherwise, // after the fixup, the compare would use a yet-undefined register. diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp index 44679d429de5..e2215c9900d0 100644 --- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp @@ -44,12 +44,7 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) { if (!Resources->canReserveResources(*MI)) { LLVM_DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI); HazardType RetVal = Hazard; - if (TII->mayBeNewStore(*MI)) { - // Make sure the register to be stored is defined by an instruction in the - // packet. - MachineOperand &MO = MI->getOperand(MI->getNumOperands() - 1); - if (!MO.isReg() || RegDefs.count(MO.getReg()) == 0) - return Hazard; + if (isNewStore(*MI)) { // The .new store version uses different resources so check if it // causes a hazard. MachineFunction *MF = MI->getParent()->getParent(); @@ -105,6 +100,15 @@ bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) { return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum)); } +/// Return true if the instruction would be converted to a new value store when +/// packetized. +bool HexagonHazardRecognizer::isNewStore(MachineInstr &MI) { + if (!TII->mayBeNewStore(MI)) + return false; + MachineOperand &MO = MI.getOperand(MI.getNumOperands() - 1); + return (MO.isReg() && RegDefs.count(MO.getReg()) != 0); +} + void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) { MachineInstr *MI = SU->getInstr(); if (!MI) @@ -119,7 +123,7 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) { if (TII->isZeroCost(MI->getOpcode())) return; - if (!Resources->canReserveResources(*MI)) { + if (!Resources->canReserveResources(*MI) || isNewStore(*MI)) { // It must be a .new store since other instructions must be able to be // reserved at this point. 
assert(TII->mayBeNewStore(*MI) && "Expecting .new store"); @@ -127,11 +131,12 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) { MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)), MI->getDebugLoc()); - assert(Resources->canReserveResources(*NewMI)); - Resources->reserveResources(*NewMI); + if (Resources->canReserveResources(*NewMI)) + Resources->reserveResources(*NewMI); + else + Resources->reserveResources(*MI); MF->deleteMachineInstr(NewMI); - } - else + } else Resources->reserveResources(*MI); LLVM_DEBUG(dbgs() << " Add instruction " << *MI); diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h index 53b9cb43b4b6..0528cbd1f15f 100644 --- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h +++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h @@ -40,6 +40,10 @@ class HexagonHazardRecognizer : public ScheduleHazardRecognizer { // The set of registers defined by instructions in the current packet. SmallSet<unsigned, 8> RegDefs; + // Return true if the instruction is a store that is converted to a new value + // store because its value is defined in the same packet. + bool isNewStore(MachineInstr &MI); + public: HexagonHazardRecognizer(const InstrItineraryData *II, const HexagonInstrInfo *HII, diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 2679e399852f..161768b8dc22 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1176,6 +1176,9 @@ void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) { EVT UVT = U->getValueType(0); if (!UVT.isSimple() || !UVT.isInteger() || UVT.getSimpleVT() == MVT::i1) continue; + // Do not generate select for all i1 vector type. + if (UVT.isVector() && UVT.getVectorElementType() == MVT::i1) + continue; if (isMemOPCandidate(N, U)) continue; @@ -1282,7 +1285,7 @@ void HexagonDAGToDAGISel::emitFunctionEntryCode() { MachineFrameInfo &MFI = MF->getFrameInfo(); MachineBasicBlock *EntryBB = &MF->front(); - unsigned AR = FuncInfo->CreateReg(MVT::i32); + Register AR = FuncInfo->CreateReg(MVT::i32); Align EntryMaxA = MFI.getMaxAlign(); BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::PS_aligna), AR) .addImm(EntryMaxA.value()); diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index ed4874baf7c8..0a6dd727eb82 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -230,8 +230,7 @@ bool Coloring::color() { WorkQ.push_back(N); } - for (unsigned I = 0; I < WorkQ.size(); ++I) { - Node N = WorkQ[I]; + for (Node N : WorkQ) { NodeSet &Ns = Edges[N]; auto P = getUniqueColor(Ns); if (P.first) { @@ -270,8 +269,7 @@ bool Coloring::color() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Coloring::dump() const { dbgs() << "{ Order: {"; - for (unsigned I = 0; I != Order.size(); ++I) { - Node P = Order[I]; + for (Node P : Order) { if (P != Ignore) dbgs() << ' ' << P; else @@ -761,8 +759,7 @@ void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const { namespace { struct ShuffleMask { ShuffleMask(ArrayRef<int> M) : Mask(M) { - for (unsigned I = 0, E = Mask.size(); I != E; ++I) { - int M = Mask[I]; + for (int M : Mask) { if (M == -1) continue; MinSrc = (MinSrc == -1) ? 
M : std::min(MinSrc, M); @@ -935,8 +932,7 @@ static SmallVector<unsigned, 4> getInputSegmentList(ShuffleMask SM, unsigned Shift = Log2_32(SegLen); BitVector Segs(alignTo(SM.MaxSrc + 1, SegLen) >> Shift); - for (int I = 0, E = SM.Mask.size(); I != E; ++I) { - int M = SM.Mask[I]; + for (int M : SM.Mask) { if (M >= 0) Segs.set(M >> Shift); } @@ -2397,6 +2393,7 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) { SDValue Base = N->getOperand(4); SDValue Modifier = N->getOperand(5); SDValue Offset = N->getOperand(6); + SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32); unsigned Opcode; unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); @@ -2418,7 +2415,8 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) { } SDVTList VTs = CurDAG->getVTList(MVT::Other); - SDValue Ops[] = { Address, Predicate, Base, Modifier, Offset, Chain }; + SDValue Ops[] = { Address, ImmOperand, + Predicate, Base, Modifier, Offset, Chain }; SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops); MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); @@ -2434,6 +2432,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { SDValue Base = N->getOperand(3); SDValue Modifier = N->getOperand(4); SDValue Offset = N->getOperand(5); + SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32); unsigned Opcode; unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); @@ -2455,7 +2454,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { } SDVTList VTs = CurDAG->getVTList(MVT::Other); - SDValue Ops[] = { Address, Base, Modifier, Offset, Chain }; + SDValue Ops[] = { Address, ImmOperand, Base, Modifier, Offset, Chain }; SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops); MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 88effed9f076..d7ca934a23e6 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -543,9 +543,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The Glue is necessary since all emitted instructions must be // stuck together. if (!CLI.IsTailCall) { - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, Glue); + for (const auto &R : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, dl, R.first, R.second, Glue); Glue = Chain.getValue(1); } } else { @@ -560,9 +559,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // Do not flag preceding copytoreg stuff together with the following stuff. Glue = SDValue(); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, Glue); + for (const auto &R : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, dl, R.first, R.second, Glue); Glue = Chain.getValue(1); } Glue = SDValue(); @@ -589,10 +587,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Add argument registers to the end of the list so that they are // known live into the call. 
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - } + for (const auto &R : RegsToPass) + Ops.push_back(DAG.getRegister(R.first, R.second.getValueType())); const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); @@ -690,7 +686,7 @@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { case InlineAsm::Kind_RegDef: case InlineAsm::Kind_RegDefEarlyClobber: { for (; NumVals; --NumVals, ++i) { - unsigned Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg(); + Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg(); if (Reg != LR) continue; HMFI.setHasClobberLR(true); @@ -1190,7 +1186,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { } // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(HRI.getRARegister(), getRegClassFor(MVT::i32)); + Register Reg = MF.addLiveIn(HRI.getRARegister(), getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } @@ -1776,6 +1772,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + // Special handling for half-precision floating point conversions. + // Lower half float conversions into library calls. + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + // Handling of indexed loads/stores: default is "expand". // for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64, @@ -1856,6 +1864,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, else setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf"); + // Routines to handle fp16 storage type. + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + setLibcallName(RTLIB::FPROUND_F64_F16, "__truncdfhf2"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + // These cause problems when the shift amount is non-constant. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); @@ -2204,8 +2217,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) // Express the shuffle mask in terms of bytes. 
SmallVector<int,8> ByteMask; unsigned ElemBytes = VecTy.getVectorElementType().getSizeInBits() / 8; - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - int M = Mask[i]; + for (int M : Mask) { if (M < 0) { for (unsigned j = 0; j != ElemBytes; ++j) ByteMask.push_back(-1); @@ -2428,8 +2440,8 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, if (AllConst) { int32_t V = (Consts[0]->getZExtValue() & 0xFF) | (Consts[1]->getZExtValue() & 0xFF) << 8 | - (Consts[1]->getZExtValue() & 0xFF) << 16 | - Consts[2]->getZExtValue() << 24; + (Consts[2]->getZExtValue() & 0xFF) << 16 | + Consts[3]->getZExtValue() << 24; return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32)); } @@ -2720,7 +2732,6 @@ SDValue HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const { if (Ty.isVector()) { - assert(Ty.isInteger() && "Only integer vectors are supported here"); unsigned W = Ty.getSizeInBits(); if (W <= 64) return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W))); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index d518c036f125..f9ce7a9407aa 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -458,6 +458,7 @@ private: SelectionDAG &DAG) const; SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const; @@ -468,7 +469,6 @@ private: SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; @@ -476,6 +476,8 @@ private: SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index f7237f496aee..0ba75a544c04 100644..100755 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -55,6 +55,12 @@ HexagonTargetLowering::initializeHVXLowering() { addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass); addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass); addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass); + if (Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) { + addRegisterClass(MVT::v32f32, &Hexagon::HvxVRRegClass); + addRegisterClass(MVT::v64f16, &Hexagon::HvxVRRegClass); + addRegisterClass(MVT::v64f32, &Hexagon::HvxWRRegClass); + addRegisterClass(MVT::v128f16, &Hexagon::HvxWRRegClass); + } } // Set up operation actions. 
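Buried in the HexagonISelLowering.cpp hunk above is a genuine bug fix rather than cleanup: buildVector32 was packing Consts[1] into bits 16..23 a second time and never masking Consts[3] into the top byte. A standalone check of the corrected packing, not part of the patch (plain integer arithmetic; the lane values are made up, and the top byte is masked here for clarity where the patch relies on truncation):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Pack four byte lanes into one 32-bit value, lane 0 in the low byte,
    // mirroring the corrected index/mask arithmetic from buildVector32.
    static uint32_t packV4I8(const uint64_t Consts[4]) {
      return (Consts[0] & 0xFF) | (Consts[1] & 0xFF) << 8 |
             (Consts[2] & 0xFF) << 16 | (Consts[3] & 0xFF) << 24;
    }

    int main() {
      const uint64_t Consts[4] = {0x11, 0x22, 0x33, 0x44};
      assert(packV4I8(Consts) == 0x44332211u);
      std::printf("packed: 0x%08x\n", (unsigned)packV4I8(Consts));
      return 0;
    }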
@@ -83,6 +89,72 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() && + Subtarget.useHVXFloatingPoint()) { + setOperationAction(ISD::FMINNUM, MVT::v64f16, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v64f16, Legal); + setOperationAction(ISD::FADD, MVT::v64f16, Legal); + setOperationAction(ISD::FSUB, MVT::v64f16, Legal); + setOperationAction(ISD::FMUL, MVT::v64f16, Legal); + setOperationAction(ISD::FADD, MVT::v32f32, Legal); + setOperationAction(ISD::FSUB, MVT::v32f32, Legal); + setOperationAction(ISD::FMUL, MVT::v32f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v32f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v32f32, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64f16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64f16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); + + // Handle ISD::BUILD_VECTOR for v32f32 in a custom way to generate vsplat + setOperationAction(ISD::BUILD_VECTOR, MVT::v32f32, Custom); + + // BUILD_VECTOR with f16 operands cannot be promoted without + // promoting the result, so lower the node to vsplat or constant pool + setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::f16, Custom); + setOperationAction(ISD::SPLAT_VECTOR, MVT::f16, Custom); + setOperationAction(ISD::SPLAT_VECTOR, MVT::v64f16, Legal); + setOperationAction(ISD::SPLAT_VECTOR, MVT::v32f32, Legal); + // Vector shuffle is always promoted to ByteV and a bitcast to f16 is + // generated. + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f16, ByteV); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f32, ByteW); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v32f32, ByteV); + + // Custom-lower BUILD_VECTOR for vector pairs. The standard (target- + // independent) handling of it would convert it to a load, which is + // not always the optimal choice. + setOperationAction(ISD::BUILD_VECTOR, MVT::v64f32, Custom); + // Make concat-vectors custom to handle concats of more than 2 vectors. 
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128f16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64f32, Custom); + + setOperationAction(ISD::LOAD, MVT::v64f32, Custom); + setOperationAction(ISD::STORE, MVT::v64f32, Custom); + setOperationAction(ISD::FADD, MVT::v64f32, Custom); + setOperationAction(ISD::FSUB, MVT::v64f32, Custom); + setOperationAction(ISD::FMUL, MVT::v64f32, Custom); + setOperationAction(ISD::FMINNUM, MVT::v64f32, Custom); + setOperationAction(ISD::FMAXNUM, MVT::v64f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v64f32, Custom); + + if (Subtarget.useHVXQFloatOps()) { + setOperationAction(ISD::FP_EXTEND, MVT::v64f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal); + } else if (Subtarget.useHVXIEEEFPOps()) { + setOperationAction(ISD::FP_EXTEND, MVT::v64f32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal); + } + + setOperationAction(ISD::MLOAD, MVT::v32f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v32f32, Custom); + setOperationAction(ISD::MLOAD, MVT::v64f16, Custom); + setOperationAction(ISD::MSTORE, MVT::v64f16, Custom); + setOperationAction(ISD::MLOAD, MVT::v64f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v64f32, Custom); + } + for (MVT T : LegalV) { setIndexedLoadAction(ISD::POST_INC, T, Legal); setIndexedStoreAction(ISD::POST_INC, T, Legal); @@ -137,6 +209,18 @@ HexagonTargetLowering::initializeHVXLowering() { setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); } + if (Subtarget.useHVXQFloatOps()) { + setOperationAction(ISD::SINT_TO_FP, T, Expand); + setOperationAction(ISD::UINT_TO_FP, T, Expand); + setOperationAction(ISD::FP_TO_SINT, T, Expand); + setOperationAction(ISD::FP_TO_UINT, T, Expand); + } else if (Subtarget.useHVXIEEEFPOps()) { + setOperationAction(ISD::SINT_TO_FP, T, Custom); + setOperationAction(ISD::UINT_TO_FP, T, Custom); + setOperationAction(ISD::FP_TO_SINT, T, Custom); + setOperationAction(ISD::FP_TO_UINT, T, Custom); + } + setCondCodeAction(ISD::SETNE, T, Expand); setCondCodeAction(ISD::SETLE, T, Expand); setCondCodeAction(ISD::SETGE, T, Expand); @@ -198,8 +282,39 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::UMIN, T, Custom); setOperationAction(ISD::UMAX, T, Custom); } + + setOperationAction(ISD::SINT_TO_FP, T, Custom); + setOperationAction(ISD::UINT_TO_FP, T, Custom); + setOperationAction(ISD::FP_TO_SINT, T, Custom); + setOperationAction(ISD::FP_TO_UINT, T, Custom); } + setCondCodeAction(ISD::SETNE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETLE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETGE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETLT, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETONE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETOLE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETOGE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETOLT, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETUNE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETULE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETUGE, MVT::v64f16, Expand); + setCondCodeAction(ISD::SETULT, MVT::v64f16, Expand); + + setCondCodeAction(ISD::SETNE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETLE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETGE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETLT, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETOGE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETOLT, MVT::v32f32, Expand); + 
setCondCodeAction(ISD::SETUNE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::v32f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::v32f32, Expand); + // Boolean vectors. for (MVT T : LegalW) { @@ -497,7 +612,9 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values, assert(ElemSize*VecLen == HwLen); SmallVector<SDValue,32> Words; - if (VecTy.getVectorElementType() != MVT::i32) { + if (VecTy.getVectorElementType() != MVT::i32 && + !(Subtarget.useHVXFloatingPoint() && + VecTy.getVectorElementType() == MVT::f32)) { assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size"); unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2; MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord); @@ -506,22 +623,31 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values, Words.push_back(DAG.getBitcast(MVT::i32, W)); } } else { - Words.assign(Values.begin(), Values.end()); + for (SDValue V : Values) + Words.push_back(DAG.getBitcast(MVT::i32, V)); } + auto isSplat = [] (ArrayRef<SDValue> Values, SDValue &SplatV) { + unsigned NumValues = Values.size(); + assert(NumValues > 0); + bool IsUndef = true; + for (unsigned i = 0; i != NumValues; ++i) { + if (Values[i].isUndef()) + continue; + IsUndef = false; + if (!SplatV.getNode()) + SplatV = Values[i]; + else if (SplatV != Values[i]) + return false; + } + if (IsUndef) + SplatV = Values[0]; + return true; + }; unsigned NumWords = Words.size(); - bool IsSplat = true, IsUndef = true; SDValue SplatV; - for (unsigned i = 0; i != NumWords && IsSplat; ++i) { - if (isUndef(Words[i])) - continue; - IsUndef = false; - if (!SplatV.getNode()) - SplatV = Words[i]; - else if (SplatV != Words[i]) - IsSplat = false; - } - if (IsUndef) + bool IsSplat = isSplat(Words, SplatV); + if (IsSplat && isUndef(SplatV)) return DAG.getUNDEF(VecTy); if (IsSplat) { assert(SplatV.getNode()); @@ -618,24 +744,75 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values, } } - // Construct two halves in parallel, then or them together. + // Find most common element to initialize vector with. This is to avoid + // unnecessary vinsert/valign for cases where the same value is present + // many times. Creates a histogram of the vector's elements to find the + // most common element n. assert(4*Words.size() == Subtarget.getVectorLength()); - SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG); - SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG); - SDValue S = DAG.getConstant(4, dl, MVT::i32); + int VecHist[32]; + int n = 0; + for (unsigned i = 0; i != NumWords; ++i) { + VecHist[i] = 0; + if (Words[i].isUndef()) + continue; + for (unsigned j = i; j != NumWords; ++j) + if (Words[i] == Words[j]) + VecHist[i]++; + + if (VecHist[i] > VecHist[n]) + n = i; + } + + SDValue HalfV = getZero(dl, VecTy, DAG); + if (VecHist[n] > 1) { + SDValue SplatV = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[n]); + HalfV = DAG.getNode(HexagonISD::VALIGN, dl, VecTy, + {HalfV, SplatV, DAG.getConstant(HwLen/2, dl, MVT::i32)}); + } + SDValue HalfV0 = HalfV; + SDValue HalfV1 = HalfV; + + // Construct two halves in parallel, then or them together. Rn and Rm count + // number of rotations needed before the next element. One last rotation is + // performed post-loop to position the last element. 
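// A small worked example of the scheme described above (hypothetical
// words, four per half): for {A, A, B, A | A, C, A, A} the histogram
// selects A as the most common word, both halves start out pre-splatted
// with A, and only B and C are inserted, each preceded by a single
// rotate by the accumulated byte count (Rn/Rm) instead of one
// rotate-and-insert per element.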
+ int Rn = 0, Rm = 0; + SDValue Sn, Sm; + SDValue N = HalfV0; + SDValue M = HalfV1; for (unsigned i = 0; i != NumWords/2; ++i) { - SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, - {HalfV0, Words[i]}); - SDValue M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, - {HalfV1, Words[i+NumWords/2]}); - HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, S}); - HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, S}); + + // Rotate by element count since last insertion. + if (Words[i] != Words[n] || VecHist[n] <= 1) { + Sn = DAG.getConstant(Rn, dl, MVT::i32); + HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn}); + N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, + {HalfV0, Words[i]}); + Rn = 0; + } + if (Words[i+NumWords/2] != Words[n] || VecHist[n] <= 1) { + Sm = DAG.getConstant(Rm, dl, MVT::i32); + HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm}); + M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, + {HalfV1, Words[i+NumWords/2]}); + Rm = 0; + } + Rn += 4; + Rm += 4; } + // Perform last rotation. + Sn = DAG.getConstant(Rn+HwLen/2, dl, MVT::i32); + Sm = DAG.getConstant(Rm, dl, MVT::i32); + HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn}); + HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm}); + + SDValue T0 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV0); + SDValue T1 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV1); + + SDValue DstV = DAG.getNode(ISD::OR, dl, ty(T0), {T0, T1}); - HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, - {HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)}); - SDValue DstV = DAG.getNode(ISD::OR, dl, VecTy, {HalfV0, HalfV1}); - return DstV; + SDValue OutV = + DAG.getBitcast(tyVector(ty(DstV), VecTy.getVectorElementType()), DstV); + return OutV; } SDValue @@ -1237,6 +1414,19 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) if (VecTy.getVectorElementType() == MVT::i1) return buildHvxVectorPred(Ops, dl, VecTy, DAG); + // In case of MVT::f16 BUILD_VECTOR, since MVT::f16 is + // not a legal type, just bitcast the node to use i16 + // types and bitcast the result back to f16 + if (VecTy.getVectorElementType() == MVT::f16) { + SmallVector<SDValue,64> NewOps; + for (unsigned i = 0; i != Size; i++) + NewOps.push_back(DAG.getBitcast(MVT::i16, Ops[i])); + + SDValue T0 = DAG.getNode(ISD::BUILD_VECTOR, dl, + tyVector(VecTy, MVT::i16), NewOps); + return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0); + } + if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) { ArrayRef<SDValue> A(Ops); MVT SingleTy = typeSplit(VecTy).first; @@ -1249,6 +1439,24 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) } SDValue +HexagonTargetLowering::LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) + const { + const SDLoc &dl(Op); + MVT VecTy = ty(Op); + MVT ArgTy = ty(Op.getOperand(0)); + + if (ArgTy == MVT::f16) { + MVT SplatTy = MVT::getVectorVT(MVT::i16, VecTy.getVectorNumElements()); + SDValue ToInt16 = DAG.getBitcast(MVT::i16, Op.getOperand(0)); + SDValue ToInt32 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, ToInt16); + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, SplatTy, ToInt32); + return DAG.getBitcast(VecTy, Splat); + } + + return SDValue(); +} + +SDValue HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const { // Vector concatenation of two integer (non-bool) vectors does not need @@ -1363,6 +1571,7 @@ SDValue HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); + MVT VecTy = ty(Op); SDValue VecV = 
Op.getOperand(0); SDValue ValV = Op.getOperand(1); SDValue IdxV = Op.getOperand(2); @@ -1370,6 +1579,14 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) if (ElemTy == MVT::i1) return insertHvxElementPred(VecV, IdxV, ValV, dl, DAG); + if (ElemTy == MVT::f16) { + SDValue T0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + tyVector(VecTy, MVT::i16), + DAG.getBitcast(tyVector(VecTy, MVT::i16), VecV), + DAG.getBitcast(MVT::i16, ValV), IdxV); + return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0); + } + return insertHvxElementReg(VecV, IdxV, ValV, dl, DAG); } @@ -1800,6 +2017,80 @@ HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi}); } +SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op, + SelectionDAG &DAG) const { + // This conversion only applies to QFloat. + assert(Subtarget.useHVXQFloatOps()); + + assert(Op->getOpcode() == ISD::FP_EXTEND); + + MVT VecTy = ty(Op); + MVT ArgTy = ty(Op.getOperand(0)); + const SDLoc &dl(Op); + assert(VecTy == MVT::v64f32 && ArgTy == MVT::v64f16); + + SDValue F16Vec = Op.getOperand(0); + + APFloat FloatVal = APFloat(1.0f); + bool Ignored; + FloatVal.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); + SDValue Fp16Ones = DAG.getConstantFP(FloatVal, dl, ArgTy); + SDValue VmpyVec = + getInstr(Hexagon::V6_vmpy_qf32_hf, dl, VecTy, {F16Vec, Fp16Ones}, DAG); + + MVT HalfTy = typeSplit(VecTy).first; + VectorPair Pair = opSplit(VmpyVec, dl, DAG); + SDValue LoVec = + getInstr(Hexagon::V6_vconv_sf_qf32, dl, HalfTy, {Pair.first}, DAG); + SDValue HiVec = + getInstr(Hexagon::V6_vconv_sf_qf32, dl, HalfTy, {Pair.second}, DAG); + + SDValue ShuffVec = + getInstr(Hexagon::V6_vshuffvdd, dl, VecTy, + {HiVec, LoVec, DAG.getConstant(-4, dl, MVT::i32)}, DAG); + + return ShuffVec; +} + +SDValue +HexagonTargetLowering::LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG) + const { + // This conversion only applies to IEEE. + assert(Subtarget.useHVXIEEEFPOps()); + + unsigned Opc = Op.getOpcode(); + // Catch invalid conversion ops (just in case). + assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT || + Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP); + MVT ResTy = ty(Op); + + if (Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT) { + MVT FpTy = ty(Op.getOperand(0)).getVectorElementType(); + // There are only conversions of f16. + if (FpTy != MVT::f16) + return SDValue(); + + MVT IntTy = ResTy.getVectorElementType(); + // Other int types aren't legal in HVX, so we shouldn't see them here. + assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32); + // Conversions to i8 and i16 are legal. + if (IntTy == MVT::i8 || IntTy == MVT::i16) + return Op; + } else { + // Converting int -> fp. + if (ResTy.getVectorElementType() != MVT::f16) + return SDValue(); + MVT IntTy = ty(Op.getOperand(0)).getVectorElementType(); + // Other int types aren't legal in HVX, so we shouldn't see them here. + assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32); + // i8, i16 -> f16 is legal. 
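// The legality matrix implied by the checks above and below, summarized
// (IEEE HVX conversions only):
//   f16 -> i8/i16 : legal, returned unchanged
//   f16 -> i32    : left to the default lowering (SDValue())
//   i8/i16 -> f16 : legal, returned unchanged
//   anything else : left to the default lowering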
+    if (IntTy == MVT::i8 || IntTy == MVT::i16)
+      return Op;
+  }
+
+  return SDValue();
+}
+
 SDValue
 HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
   assert(!Op.isMachineOpcode());
@@ -2104,10 +2395,22 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::MLOAD:
     case ISD::MSTORE:
       return SplitHvxMemOp(Op, DAG);
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT:
+      if (ty(Op).getSizeInBits() == ty(Op.getOperand(0)).getSizeInBits())
+        return SplitHvxPairOp(Op, DAG);
+      break;
     case ISD::CTPOP:
     case ISD::CTLZ:
     case ISD::CTTZ:
     case ISD::MUL:
+    case ISD::FADD:
+    case ISD::FSUB:
+    case ISD::FMUL:
+    case ISD::FMINNUM:
+    case ISD::FMAXNUM:
     case ISD::MULHS:
     case ISD::MULHU:
     case ISD::AND:
@@ -2134,6 +2437,7 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
     default:
       break;
     case ISD::BUILD_VECTOR:      return LowerHvxBuildVector(Op, DAG);
+    case ISD::SPLAT_VECTOR:      return LowerHvxSplatVector(Op, DAG);
     case ISD::CONCAT_VECTORS:    return LowerHvxConcatVectors(Op, DAG);
     case ISD::INSERT_SUBVECTOR:  return LowerHvxInsertSubvector(Op, DAG);
     case ISD::INSERT_VECTOR_ELT: return LowerHvxInsertElement(Op, DAG);
@@ -2158,6 +2462,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::MSTORE:  return LowerHvxMaskedOp(Op, DAG);
     // Unaligned loads will be handled by the default lowering.
     case ISD::LOAD:    return SDValue();
+    case ISD::FP_EXTEND: return LowerHvxFpExtend(Op, DAG);
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT:
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP: return LowerHvxConvertFpInt(Op, DAG);
   }
 #ifndef NDEBUG
   Op.dumpr(&DAG);
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 931b0c0e0090..9b4e92a16663 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -146,6 +146,48 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
   return Count;
 }
 
+// Check if the A2_tfrsi instruction is cheap or not. If the operand has
+// to be constant-extended, it is not cheap since it occupies two slots
+// in a packet.
+bool HexagonInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+  // Enable the following steps only at Os/Oz
+  if (!(MI.getMF()->getFunction().hasOptSize()))
+    return MI.isAsCheapAsAMove();
+
+  if (MI.getOpcode() == Hexagon::A2_tfrsi) {
+    auto Op = MI.getOperand(1);
+    // If the instruction has a global address as operand, it is not cheap
+    // since the operand will be constant extended.
+    if (Op.getType() == MachineOperand::MO_GlobalAddress)
+      return false;
+    // If the instruction has an operand of size > 16 bits, it will be
+    // const-extended and hence it is not cheap.
+    if (Op.isImm()) {
+      int64_t Imm = Op.getImm();
+      if (!isInt<16>(Imm))
+        return false;
+    }
+  }
+  return MI.isAsCheapAsAMove();
+}
+
+// Do not sink floating point instructions that update the USR register.
+// Example:
+//   feclearexcept
+//   F2_conv_w2sf
+//   fetestexcept
+// MachineSink sinks F2_conv_w2sf and we are not able to catch exceptions.
+// TODO: On some of these floating point instructions, USR is marked as Use.
+// In reality, these instructions also Def the USR. If USR is marked as Def,
+// some of the assumptions in assembler packetization are broken.
+bool HexagonInstrInfo::shouldSink(const MachineInstr &MI) const {
+  // Assumption: A floating point instruction that reads the USR will write
+  // the USR as well.
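// A minimal user-level sketch of the pattern being protected (assuming
// <cfenv> semantics; this is not code from the patch):
//
//   std::feclearexcept(FE_ALL_EXCEPT);
//   float F = static_cast<float>(W);            // lowers to F2_conv_w2sf
//   bool Raised = std::fetestexcept(FE_INVALID) != 0;
//
// If MachineSink moved the conversion below the fetestexcept call, the
// flag test would observe stale USR state.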
+ if (isFloat(MI) && MI.hasRegisterImplicitUseOperand(Hexagon::USR)) + return false; + return true; +} + /// Find the hardware loop instruction used to set-up the specified loop. /// On Hexagon, we have two instructions used to set-up the hardware loop /// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions @@ -1464,75 +1506,75 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const { switch (Opc) { case Hexagon::V6_vgathermh_pseudo: First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) - .add(MI.getOperand(1)) .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) .add(MI.getOperand(0)) - .addImm(0) + .addImm(MI.getOperand(1).getImm()) .addReg(Hexagon::VTMP); MBB.erase(MI); return First.getInstrIterator(); case Hexagon::V6_vgathermw_pseudo: First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) - .add(MI.getOperand(1)) .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) .add(MI.getOperand(0)) - .addImm(0) + .addImm(MI.getOperand(1).getImm()) .addReg(Hexagon::VTMP); MBB.erase(MI); return First.getInstrIterator(); case Hexagon::V6_vgathermhw_pseudo: First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) - .add(MI.getOperand(1)) .add(MI.getOperand(2)) - .add(MI.getOperand(3)); + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) .add(MI.getOperand(0)) - .addImm(0) + .addImm(MI.getOperand(1).getImm()) .addReg(Hexagon::VTMP); MBB.erase(MI); return First.getInstrIterator(); case Hexagon::V6_vgathermhq_pseudo: First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) - .add(MI.getOperand(1)) .add(MI.getOperand(2)) .add(MI.getOperand(3)) - .add(MI.getOperand(4)); + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) .add(MI.getOperand(0)) - .addImm(0) + .addImm(MI.getOperand(1).getImm()) .addReg(Hexagon::VTMP); MBB.erase(MI); return First.getInstrIterator(); case Hexagon::V6_vgathermwq_pseudo: First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) - .add(MI.getOperand(1)) .add(MI.getOperand(2)) .add(MI.getOperand(3)) - .add(MI.getOperand(4)); + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) .add(MI.getOperand(0)) - .addImm(0) + .addImm(MI.getOperand(1).getImm()) .addReg(Hexagon::VTMP); MBB.erase(MI); return First.getInstrIterator(); case Hexagon::V6_vgathermhwq_pseudo: First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) - .add(MI.getOperand(1)) .add(MI.getOperand(2)) .add(MI.getOperand(3)) - .add(MI.getOperand(4)); + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) .add(MI.getOperand(0)) - .addImm(0) + .addImm(MI.getOperand(1).getImm()) .addReg(Hexagon::VTMP); MBB.erase(MI); return First.getInstrIterator(); @@ -1851,6 +1893,7 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, case Hexagon::C4_cmplte: case Hexagon::C4_cmplteu: SrcReg2 = MI.getOperand(2).getReg(); + Value = 0; return true; case Hexagon::C2_cmpeqi: @@ -2725,7 +2768,13 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: - case Hexagon::V6_vS32Ub_ai: { + case Hexagon::V6_vS32Ub_ai: + case Hexagon::V6_vgathermh_pseudo: + case Hexagon::V6_vgathermw_pseudo: + 
case Hexagon::V6_vgathermhw_pseudo: + case Hexagon::V6_vgathermhq_pseudo: + case Hexagon::V6_vgathermwq_pseudo: + case Hexagon::V6_vgathermhwq_pseudo: { unsigned VectorSize = TRI->getSpillSize(Hexagon::HvxVRRegClass); assert(isPowerOf2_32(VectorSize)); if (Offset & (VectorSize-1)) @@ -2751,6 +2800,11 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::S4_storeirit_io: case Hexagon::S4_storeirif_io: return isShiftedUInt<6,2>(Offset); + // Handle these two compare instructions that are not extendable. + case Hexagon::A4_cmpbeqi: + return isUInt<8>(Offset); + case Hexagon::A4_cmpbgti: + return isInt<8>(Offset); } if (Extend) @@ -2788,6 +2842,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::L4_isub_memopw_io: case Hexagon::L4_add_memopw_io: case Hexagon::L4_sub_memopw_io: + case Hexagon::L4_iand_memopw_io: + case Hexagon::L4_ior_memopw_io: case Hexagon::L4_and_memopw_io: case Hexagon::L4_or_memopw_io: return (0 <= Offset && Offset <= 255); @@ -2796,6 +2852,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::L4_isub_memoph_io: case Hexagon::L4_add_memoph_io: case Hexagon::L4_sub_memoph_io: + case Hexagon::L4_iand_memoph_io: + case Hexagon::L4_ior_memoph_io: case Hexagon::L4_and_memoph_io: case Hexagon::L4_or_memoph_io: return (0 <= Offset && Offset <= 127); @@ -2804,6 +2862,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::L4_isub_memopb_io: case Hexagon::L4_add_memopb_io: case Hexagon::L4_sub_memopb_io: + case Hexagon::L4_iand_memopb_io: + case Hexagon::L4_ior_memopb_io: case Hexagon::L4_and_memopb_io: case Hexagon::L4_or_memopb_io: return (0 <= Offset && Offset <= 63); @@ -2848,8 +2908,18 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::S2_pstorerdt_io: case Hexagon::S2_pstorerdf_io: return isShiftedUInt<6,3>(Offset); + + case Hexagon::L2_loadbsw2_io: + case Hexagon::L2_loadbzw2_io: + return isShiftedInt<11,1>(Offset); + + case Hexagon::L2_loadbsw4_io: + case Hexagon::L2_loadbzw4_io: + return isShiftedInt<11,2>(Offset); } // switch + dbgs() << "Failed Opcode is : " << Opcode << " (" << getName(Opcode) + << ")\n"; llvm_unreachable("No offset range is defined for this opcode. " "Please define it in the above switch statement!"); } @@ -3486,9 +3556,9 @@ int HexagonInstrInfo::getDuplexOpcode(const MachineInstr &MI, if (Iter != DupMap.end()) return Iter->second; } else { // Conversion to Tiny core. 
-    for (auto Iter = DupMap.begin(), End = DupMap.end(); Iter != End; ++Iter)
-      if (Iter->second == OpNum)
-        return Iter->first;
+    for (const auto &Iter : DupMap)
+      if (Iter.second == OpNum)
+        return Iter.first;
   }
   return -1;
 }
@@ -3516,6 +3586,10 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
     return Hexagon::V6_vL32b_nt_cur_pi;
   case Hexagon::V6_vL32b_nt_ai:
     return Hexagon::V6_vL32b_nt_cur_ai;
+  case Hexagon::V6_vL32b_ppu:
+    return Hexagon::V6_vL32b_cur_ppu;
+  case Hexagon::V6_vL32b_nt_ppu:
+    return Hexagon::V6_vL32b_nt_cur_ppu;
   }
   return 0;
 }
@@ -3532,6 +3606,10 @@ int HexagonInstrInfo::getNonDotCurOp(const MachineInstr &MI) const {
     return Hexagon::V6_vL32b_nt_pi;
   case Hexagon::V6_vL32b_nt_cur_ai:
     return Hexagon::V6_vL32b_nt_ai;
+  case Hexagon::V6_vL32b_cur_ppu:
+    return Hexagon::V6_vL32b_ppu;
+  case Hexagon::V6_vL32b_nt_cur_ppu:
+    return Hexagon::V6_vL32b_nt_ppu;
   }
   return 0;
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 830f04d9eac3..2af09c857d86 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -335,6 +335,13 @@ public:
   getSerializableBitmaskMachineOperandTargetFlags() const override;
 
   bool isTailCall(const MachineInstr &MI) const override;
+  bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+
+  // Return true if the instruction should be sunk by MachineSink.
+  // MachineSink determines on its own whether the instruction is safe to sink;
+  // this gives the target a hook to override the default behavior with regard
+  // to which instructions should be sunk.
+  bool shouldSink(const MachineInstr &MI) const override;
 
   /// HexagonInstrInfo specifics.
 
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 2cdfbe7845b6..ea6a7498e27f 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -110,6 +110,8 @@ private:
   bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
                     const MachineOperand &ImmOp, unsigned ImmOpNum);
   bool isValidOffset(MachineInstr *MI, int Offset);
+  unsigned getBaseOpPosition(MachineInstr *MI);
+  unsigned getOffsetOpPosition(MachineInstr *MI);
 };
 
 } // end anonymous namespace
@@ -322,6 +324,25 @@ bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN,
 }
 
 bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
+  if (HII->isHVXVec(*MI)) {
+    // Only HVX vgather instructions are handled here.
+    // TODO: extend the pass to other vector load/store operations.
+    switch (MI->getOpcode()) {
+    case Hexagon::V6_vgathermh_pseudo:
+    case Hexagon::V6_vgathermw_pseudo:
+    case Hexagon::V6_vgathermhw_pseudo:
+    case Hexagon::V6_vgathermhq_pseudo:
+    case Hexagon::V6_vgathermwq_pseudo:
+    case Hexagon::V6_vgathermhwq_pseudo:
+      return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
+    default:
+      return false;
+    }
+  }
+
+  if (HII->getAddrMode(*MI) != HexagonII::BaseImmOffset)
+    return false;
+
   unsigned AlignMask = 0;
   switch (HII->getMemAccessSize(*MI)) {
   case HexagonII::MemAccessSize::DoubleWordAccess:
@@ -345,29 +366,67 @@ bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
   return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
 }
 
+unsigned HexagonOptAddrMode::getBaseOpPosition(MachineInstr *MI) {
+  const MCInstrDesc &MID = MI->getDesc();
+  switch (MI->getOpcode()) {
+  // vgather pseudos are mayLoad and mayStore; hence the Base and Offset
+  // operand positions need to be specified explicitly.
+  case Hexagon::V6_vgathermh_pseudo:
+  case Hexagon::V6_vgathermw_pseudo:
+  case Hexagon::V6_vgathermhw_pseudo:
+  case Hexagon::V6_vgathermhq_pseudo:
+  case Hexagon::V6_vgathermwq_pseudo:
+  case Hexagon::V6_vgathermhwq_pseudo:
+    return 0;
+  default:
+    return MID.mayLoad() ? 1 : 0;
+  }
+}
+
+unsigned HexagonOptAddrMode::getOffsetOpPosition(MachineInstr *MI) {
+  assert(
+      (HII->getAddrMode(*MI) == HexagonII::BaseImmOffset) &&
+      "Looking for an offset in non-BaseImmOffset addressing mode instruction");
+
+  const MCInstrDesc &MID = MI->getDesc();
+  switch (MI->getOpcode()) {
+  // vgather pseudos are mayLoad and mayStore; hence the Base and Offset
+  // operand positions need to be specified explicitly.
+  case Hexagon::V6_vgathermh_pseudo:
+  case Hexagon::V6_vgathermw_pseudo:
+  case Hexagon::V6_vgathermhw_pseudo:
+  case Hexagon::V6_vgathermhq_pseudo:
+  case Hexagon::V6_vgathermwq_pseudo:
+  case Hexagon::V6_vgathermhwq_pseudo:
+    return 1;
+  default:
+    return MID.mayLoad() ? 2 : 1;
+  }
+}
+
 bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
                                         MachineInstr *AddMI,
                                         const NodeList &UNodeList) {
   Register AddDefR = AddMI->getOperand(0).getReg();
+  Register BaseReg = AddMI->getOperand(1).getReg();
   for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
     NodeAddr<UseNode *> UN = *I;
     NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
     MachineInstr *MI = SN.Addr->getCode();
     const MCInstrDesc &MID = MI->getDesc();
     if ((!MID.mayLoad() && !MID.mayStore()) ||
-        HII->getAddrMode(*MI) != HexagonII::BaseImmOffset ||
-        HII->isHVXVec(*MI))
+        HII->getAddrMode(*MI) != HexagonII::BaseImmOffset)
       return false;
 
-    MachineOperand BaseOp = MID.mayLoad() ? MI->getOperand(1)
-                                          : MI->getOperand(0);
+    MachineOperand BaseOp = MI->getOperand(getBaseOpPosition(MI));
     if (!BaseOp.isReg() || BaseOp.getReg() != AddDefR)
       return false;
 
-    MachineOperand OffsetOp = MID.mayLoad() ? MI->getOperand(2)
-                                            : MI->getOperand(1);
+    MachineOperand OffsetOp = MI->getOperand(getOffsetOpPosition(MI));
     if (!OffsetOp.isImm())
       return false;
 
@@ -382,11 +441,19 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
     // Ex: Rx= add(Rt,#10)
     //     memw(Rx+#0) = Rs
     // will be replaced with => memw(Rt+#10) = Rs
-    Register BaseReg = AddMI->getOperand(1).getReg();
     if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList))
       return false;
   }
 
+  NodeId LRExtRegRD = 0;
+  // Iterate through all the UseNodes in SN and find the reaching def
+  // for the LRExtReg.
+  for (NodeAddr<UseNode *> UA : AddSN.Addr->members_if(DFG->IsUse, *DFG)) {
+    RegisterRef RR = UA.Addr->getRegRef(*DFG);
+    if (BaseReg == RR.Reg)
+      LRExtRegRD = UA.Addr->getReachingDef();
+  }
+
   // Update all the uses of 'add' with the appropriate base and offset
   // values.
   bool Changed = false;
@@ -400,6 +467,12 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
     LLVM_DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
                       << ">]: " << *UseMI << "\n");
     Changed |= updateAddUses(AddMI, UseMI);
+
+    // Set the reachingDef for the UseNode under consideration
+    // after updating the Add use. This local change is
+    // to avoid rebuilding of the RDF graph after update.
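// The rewrite this bookkeeping supports, restated from the example in
// the earlier comment:
//
//   Rx = add(Rt, #10)           Rx = add(Rt, #10)   ; may become dead
//   memw(Rx + #0) = Rs    =>    memw(Rt + #10) = Rs
//
// Once the use is rewritten to Rt, its use node must be linked to the
// reaching def of Rt, which the code below does in place instead of
// rebuilding the whole RDF graph.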
+ NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD); + UseN.Addr->linkToDef(UseN.Id, LRExtRegDN); } if (Changed) @@ -409,21 +482,18 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN, } bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI, - MachineInstr *UseMI) { + MachineInstr *UseMI) { const MachineOperand ImmOp = AddMI->getOperand(2); const MachineOperand AddRegOp = AddMI->getOperand(1); - Register newReg = AddRegOp.getReg(); - const MCInstrDesc &MID = UseMI->getDesc(); + Register NewReg = AddRegOp.getReg(); - MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1) - : UseMI->getOperand(0); - MachineOperand &OffsetOp = MID.mayLoad() ? UseMI->getOperand(2) - : UseMI->getOperand(1); - BaseOp.setReg(newReg); + MachineOperand &BaseOp = UseMI->getOperand(getBaseOpPosition(UseMI)); + MachineOperand &OffsetOp = UseMI->getOperand(getOffsetOpPosition(UseMI)); + BaseOp.setReg(NewReg); BaseOp.setIsUndef(AddRegOp.isUndef()); BaseOp.setImplicit(AddRegOp.isImplicit()); OffsetOp.setImm(ImmOp.getImm() + OffsetOp.getImm()); - MRI->clearKillFlags(newReg); + MRI->clearKillFlags(NewReg); return true; } diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index cad5ca8ab92e..3abbd896c519 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -87,18 +87,6 @@ def V8I8: PatLeaf<(v8i8 DoubleRegs:$R)>; def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>; def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>; -def HQ8: PatLeaf<(VecQ8 HvxQR:$R)>; -def HQ16: PatLeaf<(VecQ16 HvxQR:$R)>; -def HQ32: PatLeaf<(VecQ32 HvxQR:$R)>; - -def HVI8: PatLeaf<(VecI8 HvxVR:$R)>; -def HVI16: PatLeaf<(VecI16 HvxVR:$R)>; -def HVI32: PatLeaf<(VecI32 HvxVR:$R)>; - -def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; -def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; -def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; - def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>; def SDTVecVecIntOp: @@ -269,6 +257,9 @@ def anyimm3: PatLeaf<(i32 AnyImm3:$Addr)>; def f32ImmPred : PatLeaf<(f32 fpimm:$F)>; def f64ImmPred : PatLeaf<(f64 fpimm:$F)>; +def f32zero: PatLeaf<(f32 fpimm:$F), [{ + return N->isExactlyValue(APFloat::getZero(APFloat::IEEEsingle(), false)); +}]>; // This complex pattern is really only to detect various forms of // sign-extension i32->i64. 
The selected value will be of type i64 @@ -378,6 +369,12 @@ def Umin: pf2<umin>; def Umax: pf2<umax>; def Rol: pf2<rotl>; +def Fptosi: pf1<fp_to_sint>; +def Fptoui: pf1<fp_to_uint>; +def Sitofp: pf1<sint_to_fp>; +def Uitofp: pf1<uint_to_fp>; + + // --(1) Immediate ------------------------------------------------------- // @@ -2083,7 +2080,7 @@ let AddedComplexity = 20 in { defm: Loadxi_pat<sextloadi8, i32, anyimm0, L2_loadrb_io>; defm: Loadxi_pat<sextloadi16, i32, anyimm1, L2_loadrh_io>; defm: Loadxi_pat<sextloadv2i8, v2i16, anyimm1, L2_loadbsw2_io>; - defm: Loadxi_pat<sextloadv4i8, v4i16, anyimm2, L2_loadbzw4_io>; + defm: Loadxi_pat<sextloadv4i8, v4i16, anyimm2, L2_loadbsw4_io>; defm: Loadxi_pat<zextloadi1, i32, anyimm0, L2_loadrub_io>; defm: Loadxi_pat<zextloadi8, i32, anyimm0, L2_loadrub_io>; defm: Loadxi_pat<zextloadi16, i32, anyimm1, L2_loadruh_io>; @@ -2135,7 +2132,7 @@ let AddedComplexity = 60 in { def: Loadxu_pat<sextloadi8, i32, anyimm0, L4_loadrb_ur>; def: Loadxu_pat<sextloadi16, i32, anyimm1, L4_loadrh_ur>; def: Loadxu_pat<sextloadv2i8, v2i16, anyimm1, L4_loadbsw2_ur>; - def: Loadxu_pat<sextloadv4i8, v4i16, anyimm2, L4_loadbzw4_ur>; + def: Loadxu_pat<sextloadv4i8, v4i16, anyimm2, L4_loadbsw4_ur>; def: Loadxu_pat<zextloadi1, i32, anyimm0, L4_loadrub_ur>; def: Loadxu_pat<zextloadi8, i32, anyimm0, L4_loadrub_ur>; def: Loadxu_pat<zextloadi16, i32, anyimm1, L4_loadruh_ur>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index a22a3f8ec0ca..0a3dff057ccd 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -6,6 +6,21 @@ // //===----------------------------------------------------------------------===// +def HQ8: PatLeaf<(VecQ8 HvxQR:$R)>; +def HQ16: PatLeaf<(VecQ16 HvxQR:$R)>; +def HQ32: PatLeaf<(VecQ32 HvxQR:$R)>; + +def HVI8: PatLeaf<(VecI8 HvxVR:$R)>; +def HVI16: PatLeaf<(VecI16 HvxVR:$R)>; +def HVI32: PatLeaf<(VecI32 HvxVR:$R)>; +def HVF16: PatLeaf<(VecF16 HvxVR:$R)>; +def HVF32: PatLeaf<(VecF32 HvxVR:$R)>; + +def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; +def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; +def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; +def HWF16: PatLeaf<(VecPF16 HvxWR:$R)>; +def HWF32: PatLeaf<(VecPF32 HvxWR:$R)>; def SDTVecUnaryOp: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; @@ -49,7 +64,7 @@ def HexagonVPACKL: SDNode<"HexagonISD::VPACKL", SDTVecUnaryOp>; def HexagonVUNPACK: SDNode<"HexagonISD::VUNPACK", SDTVecUnaryOp>; def HexagonVUNPACKU: SDNode<"HexagonISD::VUNPACKU", SDTVecUnaryOp>; -def vzero: PatFrag<(ops), (splat_vector (i32 0))>; +def vzero: PatFrags<(ops), [(splat_vector (i32 0)), (splat_vector (f32zero))]>; def qtrue: PatFrag<(ops), (HexagonQTRUE)>; def qfalse: PatFrag<(ops), (HexagonQFALSE)>; def qcat: PatFrag<(ops node:$Qs, node:$Qt), @@ -150,12 +165,19 @@ let Predicates = [UseHVX] in { defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI8, IsVecOff>; defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI16, IsVecOff>; defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI32, IsVecOff>; - defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI8, IsVecOff>; defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI16, IsVecOff>; defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI32, IsVecOff>; } +let Predicates = [UseHVXV68] in { + defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF16, IsVecOff>; + defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF32, IsVecOff>; + defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecF16, IsVecOff>; + defm: HvxLda_pat<V6_vL32b_ai, 
alignedload, VecF32, IsVecOff>; + defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF16, IsVecOff>; + defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF32, IsVecOff>; +} // HVX stores @@ -199,6 +221,15 @@ let Predicates = [UseHVX] in { defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVI32, IsVecOff>; } +let Predicates = [UseHVXV68] in { + defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVF16, IsVecOff>; + defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVF32, IsVecOff>; + defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVF16, IsVecOff>; + defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVF32, IsVecOff>; + defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVF16, IsVecOff>; + defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVF32, IsVecOff>; +} + // Bitcasts between same-size vector types are no-ops, except for the // actual type change. let Predicates = [UseHVX] in { @@ -211,6 +242,24 @@ let Predicates = [UseHVX] in { defm: NopCast_pat<VecPI16, VecPI32, HvxWR>; } +let Predicates = [UseHVX, UseHVXFloatingPoint] in { + defm: NopCast_pat<VecI8, VecF16, HvxVR>; + defm: NopCast_pat<VecI8, VecF32, HvxVR>; + defm: NopCast_pat<VecI16, VecF16, HvxVR>; + defm: NopCast_pat<VecI16, VecF32, HvxVR>; + defm: NopCast_pat<VecI32, VecF16, HvxVR>; + defm: NopCast_pat<VecI32, VecF32, HvxVR>; + defm: NopCast_pat<VecF16, VecF32, HvxVR>; + + defm: NopCast_pat<VecPI8, VecPF16, HvxWR>; + defm: NopCast_pat<VecPI8, VecPF32, HvxWR>; + defm: NopCast_pat<VecPI16, VecPF16, HvxWR>; + defm: NopCast_pat<VecPI16, VecPF32, HvxWR>; + defm: NopCast_pat<VecPI32, VecPF16, HvxWR>; + defm: NopCast_pat<VecPI32, VecPF32, HvxWR>; + defm: NopCast_pat<VecPF16, VecPF32, HvxWR>; +} + let Predicates = [UseHVX] in { let AddedComplexity = 100 in { // These should be preferred over a vsplat of 0. @@ -220,6 +269,7 @@ let Predicates = [UseHVX] in { def: Pat<(VecPI8 vzero), (PS_vdd0)>; def: Pat<(VecPI16 vzero), (PS_vdd0)>; def: Pat<(VecPI32 vzero), (PS_vdd0)>; + def: Pat<(VecPF32 vzero), (PS_vdd0)>; def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>; def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>; @@ -251,6 +301,28 @@ let Predicates = [UseHVX] in { (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; } +let Predicates = [UseHVX, UseHVXFloatingPoint] in { + let AddedComplexity = 100 in { + def: Pat<(VecF16 vzero), (V6_vd0)>; + def: Pat<(VecF32 vzero), (V6_vd0)>; + def: Pat<(VecPF16 vzero), (PS_vdd0)>; + def: Pat<(VecPF32 vzero), (PS_vdd0)>; + + def: Pat<(concat_vectors (VecF16 vzero), (VecF16 vzero)), (PS_vdd0)>; + def: Pat<(concat_vectors (VecF32 vzero), (VecF32 vzero)), (PS_vdd0)>; + } + + def: Pat<(VecPF16 (concat_vectors HVF16:$Vs, HVF16:$Vt)), + (Combinev HvxVR:$Vt, HvxVR:$Vs)>; + def: Pat<(VecPF32 (concat_vectors HVF32:$Vs, HVF32:$Vt)), + (Combinev HvxVR:$Vt, HvxVR:$Vs)>; + + def: Pat<(HexagonVINSERTW0 HVF16:$Vu, I32:$Rt), + (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; + def: Pat<(HexagonVINSERTW0 HVF32:$Vu, I32:$Rt), + (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; +} + // Splats for HvxV60 def V60splatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>; def V60splatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>; @@ -307,6 +379,18 @@ let Predicates = [UseHVX,UseHVXV62] in { def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V62splatrw $Rs))>; } } +let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { + let AddedComplexity = 30 in { + def: Pat<(VecF16 (splat_vector u16_0ImmPred:$V)), (V62splatih imm:$V)>; + def: Pat<(VecF32 (splat_vector anyint:$V)), (V62splatiw imm:$V)>; + def: Pat<(VecF32 (splat_vector 
f32ImmPred:$V)), (V62splatiw (ftoi $V))>; + } + let AddedComplexity = 20 in { + def: Pat<(VecF16 (splat_vector I32:$Rs)), (V62splatrh $Rs)>; + def: Pat<(VecF32 (splat_vector I32:$Rs)), (V62splatrw $Rs)>; + def: Pat<(VecF32 (splat_vector F32:$Rs)), (V62splatrw $Rs)>; + } +} class Vneg1<ValueType VecTy> : PatFrag<(ops), (VecTy (splat_vector (i32 -1)))>; @@ -369,6 +453,107 @@ let Predicates = [UseHVX] in { (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>; } +// For now, we always deal with vector floating point in SF mode. +class OpR_RR_pat_conv<InstHexagon MI, PatFrag Op, ValueType ResType, + PatFrag RsPred, PatFrag RtPred = RsPred> + : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)), + (V6_vconv_sf_qf32 (VecF32 (MI RsPred:$Rs, RtPred:$Rt)))>; + +class OpR_RR_pat_conv_hf<InstHexagon MI, PatFrag Op, ValueType ResType, + PatFrag RsPred, PatFrag RtPred = RsPred> + : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)), + (V6_vconv_hf_qf16 (VecF16 (MI RsPred:$Rs, RtPred:$Rt)))>; + +let Predicates = [UseHVXV68, UseHVXQFloat] in { + def: OpR_RR_pat_conv_hf<V6_vsub_hf, pf2<fsub>, VecF16, HVF16>; + def: OpR_RR_pat_conv_hf<V6_vadd_hf, pf2<fadd>, VecF16, HVF16>; + def: OpR_RR_pat_conv_hf<V6_vmpy_qf16_hf, pf2<fmul>, VecF16, HVF16>; + def: OpR_RR_pat_conv<V6_vsub_sf, pf2<fsub>, VecF32, HVF32>; + def: OpR_RR_pat_conv<V6_vadd_sf, pf2<fadd>, VecF32, HVF32>; + def: OpR_RR_pat_conv<V6_vmpy_qf32_sf, pf2<fmul>, VecF32, HVF32>; + + // For now we assume that the fp32 register is always coming in as IEEE float + // since the qfloat arithmetic instructions above always generate the + // accompanying conversions as part of their pattern + def: Pat<(VecF16 (pf1<fpround> HWF32:$Vuu)), + (V6_vdealh (V6_vconv_hf_qf32 + (VecPF32 (Combinev (V6_vadd_sf (HiVec HvxWR:$Vuu), (V6_vd0)), + (V6_vadd_sf (LoVec HvxWR:$Vuu), (V6_vd0)) + ))))>; + // fpextend for QFloat is handled manually in HexagonISelLoweringHVX.cpp. 
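// Net effect of the OpR_RR_pat_conv patterns above, sketched for one
// case: an fadd on v32f32 selects to
//
//   Vtmp = V6_vadd_sf Vu, Vv        ; result in qf32 format
//   Vd   = V6_vconv_sf_qf32 Vtmp    ; converted back to IEEE sf
//
// so every qfloat arithmetic result is converted right away, which is
// why the fp32 inputs can be assumed to arrive as IEEE float.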
+} + +// HVX IEEE arithmetic Instructions +let Predicates = [UseHVXV68, UseHVXIEEEFP] in { + def: Pat<(fadd HVF16:$Rs, HVF16:$Rt), + (V6_vadd_hf_hf HVF16:$Rs, HVF16:$Rt)>; + def: Pat<(fadd HVF32:$Rs, HVF32:$Rt), + (V6_vadd_sf_sf HVF32:$Rs, HVF32:$Rt)>; + def: Pat<(fsub HVF16:$Rs, HVF16:$Rt), + (V6_vsub_hf_hf HVF16:$Rs, HVF16:$Rt)>; + def: Pat<(fsub HVF32:$Rs, HVF32:$Rt), + (V6_vsub_sf_sf HVF32:$Rs, HVF32:$Rt)>; + def: Pat<(fmul HVF16:$Rs, HVF16:$Rt), + (V6_vmpy_hf_hf HVF16:$Rs, HVF16:$Rt)>; + def: Pat<(fmul HVF32:$Rs, HVF32:$Rt), + (V6_vmpy_sf_sf HVF32:$Rs, HVF32:$Rt)>; + + def: Pat<(VecF16 (pf1<fpround> HWF32:$Vuu)), + (V6_vdealh (V6_vcvt_hf_sf (HiVec HvxWR:$Vuu), (LoVec HvxWR:$Vuu)))>; + def: Pat<(VecPF32 (pf1<fpextend> HVF16:$Vu)), + (V6_vcvt_sf_hf (V6_vshuffh HvxVR:$Vu))>; + + def: OpR_R_pat<V6_vcvt_h_hf, Fptosi, VecI16, HVF16>; + def: OpR_R_pat<V6_vcvt_uh_hf, Fptoui, VecI16, HVF16>; + def: OpR_R_pat<V6_vcvt_hf_h, Sitofp, VecF16, HVI16>; + def: OpR_R_pat<V6_vcvt_hf_uh, Uitofp, VecF16, HVI16>; + + def: Pat<(VecI8 (Fptosi HWF16:$Vu)), + (V6_vcvt_b_hf (HiVec $Vu), (LoVec $Vu))>; + def: Pat<(VecI8 (Fptoui HWF16:$Vu)), + (V6_vcvt_ub_hf (HiVec $Vu), (LoVec $Vu))>; + def: Pat<(VecPF16 (Sitofp HVI8:$Vu)), (V6_vcvt_hf_b HvxVR:$Vu)>; + def: Pat<(VecPF16 (Uitofp HVI8:$Vu)), (V6_vcvt_hf_ub HvxVR:$Vu)>; +} + +let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { + def: Pat<(vselect HQ16:$Qu, HVF16:$Vs, HVF16:$Vt), + (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(vselect (qnot HQ16:$Qu), HVF16:$Vs, HVF16:$Vt), + (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>; + + def: Pat<(vselect HQ32:$Qu, HVF32:$Vs, HVF32:$Vt), + (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(vselect (qnot HQ32:$Qu), HVF32:$Vs, HVF32:$Vt), + (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>; +} + +let Predicates = [UseHVXV68, UseHVX128B, UseHVXQFloat] in { + let AddedComplexity = 220 in { + defm: MinMax_pats<V6_vmin_hf, V6_vmax_hf, vselect, setgt, VecQ16, HVF16>; + defm: MinMax_pats<V6_vmin_hf, V6_vmax_hf, vselect, setogt, VecQ16, HVF16>; + defm: MinMax_pats<V6_vmin_sf, V6_vmax_sf, vselect, setgt, VecQ32, HVF32>; + defm: MinMax_pats<V6_vmin_sf, V6_vmax_sf, vselect, setogt, VecQ32, HVF32>; + } + def: OpR_RR_pat<V6_vmin_hf, pf2<fminnum>, VecF16, HVF16>; + def: OpR_RR_pat<V6_vmax_hf, pf2<fmaxnum>, VecF16, HVF16>; + def: OpR_RR_pat<V6_vmin_sf, pf2<fminnum>, VecF32, HVF32>; + def: OpR_RR_pat<V6_vmax_sf, pf2<fmaxnum>, VecF32, HVF32>; +} + +let Predicates = [UseHVXV68, UseHVX128B, UseHVXIEEEFP] in { + let AddedComplexity = 220 in { + defm: MinMax_pats<V6_vfmin_hf, V6_vfmax_hf, vselect, setgt, VecQ16, HVF16>; + defm: MinMax_pats<V6_vfmin_hf, V6_vfmax_hf, vselect, setogt, VecQ16, HVF16>; + defm: MinMax_pats<V6_vfmin_sf, V6_vfmax_sf, vselect, setgt, VecQ32, HVF32>; + defm: MinMax_pats<V6_vfmin_sf, V6_vfmax_sf, vselect, setogt, VecQ32, HVF32>; + } + def: OpR_RR_pat<V6_vfmin_hf, pf2<fminnum>, VecF16, HVF16>; + def: OpR_RR_pat<V6_vfmax_hf, pf2<fmaxnum>, VecF16, HVF16>; + def: OpR_RR_pat<V6_vfmin_sf, pf2<fminnum>, VecF32, HVF32>; + def: OpR_RR_pat<V6_vfmax_sf, pf2<fmaxnum>, VecF32, HVF32>; +} + let Predicates = [UseHVX] in { // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...), // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo, @@ -551,6 +736,12 @@ let Predicates = [UseHVX] in { def: HvxSel_pat<PS_wselect, HWI32>; } +def V2Q: OutPatFrag<(ops node:$Vs), (V6_vandvrt $Vs, (A2_tfrsi -1))>; + +let Predicates = [UseHVX] in + def: Pat<(select I1:$Pu, VecI1:$Qs, VecI1:$Qt), + (V2Q (PS_vselect $Pu, (Q2V $Qs), (Q2V $Qt)))>; + let 
Predicates = [UseHVX] in { def: Pat<(VecQ8 (qtrue)), (PS_qtrue)>; def: Pat<(VecQ16 (qtrue)), (PS_qtrue)>; @@ -623,3 +814,63 @@ let Predicates = [UseHVX] in { def: AccRRR_pat<V6_vgtuw_or, Or, setugt, HQ32, HVI32, HVI32>; def: AccRRR_pat<V6_vgtuw_xor, Xor, setugt, HQ32, HVI32, HVI32>; } + +let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { + def: OpR_RR_pat<V6_veqh, seteq, VecQ16, HVF16>; + def: OpR_RR_pat<V6_veqh, setoeq, VecQ16, HVF16>; + def: OpR_RR_pat<V6_veqh, setueq, VecQ16, HVF16>; + def: OpR_RR_pat<V6_vgthf, setgt, VecQ16, HVF16>; + def: OpR_RR_pat<V6_vgthf, setogt, VecQ16, HVF16>; + def: OpR_RR_pat<V6_vgthf, setugt, VecQ16, HVF16>; + + def: OpR_RR_pat<V6_veqw, seteq, VecQ32, HVF32>; + def: OpR_RR_pat<V6_veqw, setoeq, VecQ32, HVF32>; + def: OpR_RR_pat<V6_veqw, setueq, VecQ32, HVF32>; + def: OpR_RR_pat<V6_vgtsf, setgt, VecQ32, HVF32>; + def: OpR_RR_pat<V6_vgtsf, setogt, VecQ32, HVF32>; + def: OpR_RR_pat<V6_vgtsf, setugt, VecQ32, HVF32>; + + def: AccRRR_pat<V6_veqh_and, And, seteq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_or, Or, seteq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_xor, Xor, seteq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_and, And, setoeq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_or, Or, setoeq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_xor, Xor, setoeq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_and, And, setueq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_or, Or, setueq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_veqh_xor, Xor, setueq, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_and, And, setgt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_or, Or, setgt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_xor, Xor, setgt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_and, And, setogt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_or, Or, setogt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_xor, Xor, setogt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_and, And, setugt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_or, Or, setugt, HQ16, HVF16, HVF16>; + def: AccRRR_pat<V6_vgthf_xor, Xor, setugt, HQ16, HVF16, HVF16>; + + def: AccRRR_pat<V6_veqw_and, And, seteq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_or, Or, seteq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_xor, Xor, seteq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_and, And, setoeq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_or, Or, setoeq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_xor, Xor, setoeq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_and, And, setueq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_or, Or, setueq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_veqw_xor, Xor, setueq, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_and, And, setgt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_or, Or, setgt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_xor, Xor, setgt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_and, And, setogt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_or, Or, setogt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_xor, Xor, setogt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_and, And, setugt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_or, Or, setugt, HQ32, HVF32, HVF32>; + def: AccRRR_pat<V6_vgtsf_xor, Xor, setugt, HQ32, HVF32, HVF32>; + + def: Pat<(VecQ16 (setone HVF16:$Vt, HVF16:$Vu)), + (V6_pred_not (V6_veqh HvxVR:$Vt, HvxVR:$Vu))>; + + def: Pat<(VecQ32 (setone HVF32:$Vt, HVF32:$Vu)), + (V6_pred_not (V6_veqw HvxVR:$Vt, HvxVR:$Vu))>; +} diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td 
b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td index 4cd45ecbe1a1..f927f9b9e7c3 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td @@ -7,28 +7,31 @@ //===----------------------------------------------------------------------===// multiclass vgathermh<RegisterClass RC> { - let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), - (ins IntRegs:$_dst_, IntRegs:$Rt, - ModRegs:$Mu, RC:$Vv), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), ".error \"should not emit\" ", []>; } multiclass vgathermw<RegisterClass RC> { - let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = WordAccess in def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), - (ins IntRegs:$_dst_, IntRegs:$Rt, - ModRegs:$Mu, RC:$Vv), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), ".error \"should not emit\" ", []>; } multiclass vgathermhw<RegisterClass RC> { - let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), - (ins IntRegs:$_dst_, IntRegs:$Rt, - ModRegs:$Mu, RC:$Vv), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), ".error \"should not emit\" ", []>; } @@ -38,28 +41,34 @@ defm V6_vgathermw_pseudo : vgathermw<HvxVR>; defm V6_vgathermhw_pseudo : vgathermhw<HvxWR>; multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> { - let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), - (ins IntRegs:$_dst_, RC2:$Vq, IntRegs:$Rt, - ModRegs:$Mu, RC1:$Vv), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + RC2:$Vq, IntRegs:$Rt, ModRegs:$Mu, + RC1:$Vv), ".error \"should not emit\" ", []>; } multiclass vgathermwq<RegisterClass RC1, RegisterClass RC2> { - let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = WordAccess in def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), - (ins IntRegs:$_dst_, RC2:$Vq, IntRegs:$Rt, - ModRegs:$Mu, RC1:$Vv), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + RC2:$Vq, IntRegs:$Rt, ModRegs:$Mu, + RC1:$Vv), ".error \"should not emit\" ", []>; } multiclass vgathermhwq<RegisterClass RC1, RegisterClass RC2> { - let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), - (ins IntRegs:$_dst_, RC2:$Vq, IntRegs:$Rt, - ModRegs:$Mu, RC1:$Vv), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + RC2:$Vq, IntRegs:$Rt, ModRegs:$Mu, + RC1:$Vv), ".error \"should not emit\" ", []>; } diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index 8b7138d3c809..4c387c8ba638 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ 
-479,6 +479,10 @@ def VecI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32i16, v64i16, v32i16]>; def VecI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v16i32, v32i32, v16i32]>; +def VecF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v32f16, v64f16, v32f16]>; +def VecF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v16f32, v32f32, v16f32]>; def VecPI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v128i8, v256i8, v128i8]>; @@ -486,6 +490,10 @@ def VecPI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i16, v128i16, v64i16]>; def VecPI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32i32, v64i32, v32i32]>; +def VecPF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v64f16, v128f16, v64f16]>; +def VecPF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], + [v32f32, v64f32, v32f32]>; def VecQ8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i1, v128i1, v64i1]>; @@ -496,13 +504,13 @@ def VecQ32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], // HVX register classes -def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512, +def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32, VecF16, VecF32], 512, (add (sequence "V%u", 0, 31), VTMP)> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>; } -def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024, +def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32, VecPF16, VecPF32], 1024, (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> { let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>; diff --git a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp index 9a0f57fce97d..ada78ca70559 100644 --- a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -224,14 +224,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { unsigned NumRegs = MRI->getNumVirtRegs(); BitVector DoubleRegs(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { - unsigned R = Register::index2VirtReg(i); + Register R = Register::index2VirtReg(i); if (MRI->getRegClass(R) == DoubleRC) DoubleRegs.set(i); } BitVector FixedRegs(NumRegs); for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { - unsigned R = Register::index2VirtReg(x); + Register R = Register::index2VirtReg(x); MachineInstr *DefI = MRI->getVRegDef(R); // In some cases a register may exist, but never be defined or used. // It should never appear anywhere, but mark it as "fixed", just to be @@ -244,7 +244,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { if (FixedRegs[x]) continue; - unsigned R = Register::index2VirtReg(x); + Register R = Register::index2VirtReg(x); LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~"); USet &Asc = AssocMap[R]; for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end(); @@ -281,7 +281,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { unsigned NextP = 1; USet Visited; for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { - unsigned R = Register::index2VirtReg(x); + Register R = Register::index2VirtReg(x); if (Visited.count(R)) continue; // Create a new partition for R. 
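// For reference, a sketch of the index/virtual-register mapping used by
// the loops above (standard LLVM API, stated here as an assumption):
//
//   Register R = Register::index2VirtReg(i);  // i-th virtual register
//   unsigned I = Register::virtReg2Index(R);  // inverse mapping
//   assert(R.isVirtual());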
@@ -578,8 +578,7 @@ void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) { append_range(WorkQ, *WorkQ[i]); USet Rs; - for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) { - MachineLoop *L = WorkQ[i]; + for (MachineLoop *L : WorkQ) { Rs.clear(); collectIndRegsForLoop(L, Rs); if (!Rs.empty()) diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index 08bb4580b585..bdd2a2cfc5fa 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -228,7 +228,9 @@ bool HexagonSubtarget::isTypeForHVX(Type *VecTy, bool IncludeBool) const { if (!VecTy->isVectorTy() || isa<ScalableVectorType>(VecTy)) return false; // Avoid types like <2 x i32*>. - if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy()) + Type *ScalTy = VecTy->getScalarType(); + if (!ScalTy->isIntegerTy() && + !(ScalTy->isFloatingPointTy() && useHVXFloatingPoint())) return false; // The given type may be something like <17 x i32>, which is not MVT, // but can be represented as (non-simple) EVT. @@ -466,28 +468,46 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx, return; } - if (!hasV60Ops()) - return; - - // Set the latency for a copy to zero since we hope that is will get removed. + // Set the latency for a copy to zero since we hope that is will get + // removed. if (DstInst->isCopy()) Dep.setLatency(0); // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine // the correct latency. - if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) { + // If there are multiple uses of the def of COPY/REG_SEQUENCE, set the latency + // only if the latencies on all the uses are equal, otherwise set it to + // default. + if ((DstInst->isRegSequence() || DstInst->isCopy())) { Register DReg = DstInst->getOperand(0).getReg(); - MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr(); - unsigned UseIdx = -1; - for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) { - const MachineOperand &MO = DDst->getOperand(OpNum); - if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) { - UseIdx = OpNum; + int DLatency = -1; + for (const auto &DDep : Dst->Succs) { + MachineInstr *DDst = DDep.getSUnit()->getInstr(); + int UseIdx = -1; + for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) { + const MachineOperand &MO = DDst->getOperand(OpNum); + if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) { + UseIdx = OpNum; + break; + } + } + + if (UseIdx == -1) + continue; + + int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst, 0, + *DDst, UseIdx)); + // Set DLatency for the first time. + DLatency = (DLatency == -1) ? Latency : DLatency; + + // For multiple uses, if the Latency is different across uses, reset + // DLatency. 
+ if (DLatency != Latency) { + DLatency = -1; break; } } - int DLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst, - 0, *DDst, UseIdx)); + DLatency = std::max(DLatency, 0); Dep.setLatency((unsigned)DLatency); } @@ -500,8 +520,10 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx, Dep.setLatency(0); return; } - - updateLatency(*SrcInst, *DstInst, Dep); + int Latency = Dep.getLatency(); + bool IsArtificial = Dep.isArtificial(); + Latency = updateLatency(*SrcInst, *DstInst, IsArtificial, Latency); + Dep.setLatency(Latency); } void HexagonSubtarget::getPostRAMutations( @@ -530,21 +552,19 @@ bool HexagonSubtarget::usePredicatedCalls() const { return EnablePredicatedCalls; } -void HexagonSubtarget::updateLatency(MachineInstr &SrcInst, - MachineInstr &DstInst, SDep &Dep) const { - if (Dep.isArtificial()) { - Dep.setLatency(1); - return; - } - +int HexagonSubtarget::updateLatency(MachineInstr &SrcInst, + MachineInstr &DstInst, bool IsArtificial, + int Latency) const { + if (IsArtificial) + return 1; if (!hasV60Ops()) - return; - - auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo()); + return Latency; + auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo()); // BSB scheduling. if (QII.isHVXVec(SrcInst) || useBSBScheduling()) - Dep.setLatency((Dep.getLatency() + 1) >> 1); + Latency = (Latency + 1) >> 1; + return Latency; } void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const { @@ -580,9 +600,9 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const { // For some instructions (ex: COPY), we might end up with < 0 latency // as they don't have any Itinerary class associated with them. Latency = std::max(Latency, 0); - + bool IsArtificial = I.isArtificial(); + Latency = updateLatency(*SrcI, *DstI, IsArtificial, Latency); I.setLatency(Latency); - updateLatency(*SrcI, *DstI, I); } } diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index e4f375440be1..db682676cf12 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -325,8 +325,8 @@ public: private: // Helper function responsible for increasing the latency only. 
- void updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst, SDep &Dep) - const; + int updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst, + bool IsArtificial, int Latency) const; void restoreLatency(SUnit *Src, SUnit *Dst) const; void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const; bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index fcf829b522cc..c6703bb8a62a 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -139,6 +139,7 @@ namespace llvm { void initializeHexagonBitSimplifyPass(PassRegistry&); void initializeHexagonConstExtendersPass(PassRegistry&); void initializeHexagonConstPropagationPass(PassRegistry&); + void initializeHexagonCopyToCombinePass(PassRegistry&); void initializeHexagonEarlyIfConversionPass(PassRegistry&); void initializeHexagonExpandCondsetsPass(PassRegistry&); void initializeHexagonGenMuxPass(PassRegistry&); @@ -199,6 +200,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { initializeHexagonBitSimplifyPass(PR); initializeHexagonConstExtendersPass(PR); initializeHexagonConstPropagationPass(PR); + initializeHexagonCopyToCombinePass(PR); initializeHexagonEarlyIfConversionPass(PR); initializeHexagonGenMuxPass(PR); initializeHexagonHardwareLoopsPass(PR); diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 85ec0cdcd8f0..e9b658d18175 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -886,7 +886,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // Create a dot new machine instruction to see if resources can be // allocated. If not, bail out now. - int NewOpcode = HII->getDotNewOp(MI); + int NewOpcode = (RC != &Hexagon::PredRegsRegClass) ? HII->getDotNewOp(MI) : + HII->getDotNewPredOp(MI, MBPI); const MCInstrDesc &D = HII->get(NewOpcode); MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc()); bool ResourcesAvailable = ResourceTracker->canReserveResources(*NewMI); @@ -1107,6 +1108,11 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ, HII.isHVXMemWithAIndirect(MI, MJ)) return true; + // Don't allow a store and an instruction that must be in slot0 and + // doesn't allow a slot1 instruction. + if (MI.mayStore() && HII.isRestrictNoSlot1Store(MJ) && HII.isPureSlot0(MJ)) + return true; + // An inline asm cannot be together with a branch, because we may not be // able to remove the asm out after packetizing (i.e. if the asm must be // moved past the bundle). Similarly, two asms cannot be together to avoid @@ -1526,6 +1532,13 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { bool IsVecJ = HII->isHVXVec(J); bool IsVecI = HII->isHVXVec(I); + // Don't reorder the loads if there is an order dependence. This would + // occur if the first instruction must go in slot0. 
+ if (LoadJ && LoadI && HII->isPureSlot0(J)) { + FoundSequentialDependence = true; + break; + } + if (Slot1Store && MF.getSubtarget<HexagonSubtarget>().hasV65Ops() && ((LoadJ && StoreI && !NVStoreI) || (StoreJ && LoadI && !NVStoreJ)) && @@ -1696,9 +1709,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) { MachineBasicBlock::iterator MII = MI.getIterator(); MachineBasicBlock *MBB = MI.getParent(); - if (CurrentPacketMIs.empty()) + if (CurrentPacketMIs.empty()) { PacketStalls = false; + PacketStallCycles = 0; + } PacketStalls |= producesStall(MI); + PacketStallCycles = std::max(PacketStallCycles, calcStall(MI)); if (MI.isImplicitDef()) { // Add to the packet to allow subsequent instructions to be checked @@ -1818,14 +1834,6 @@ bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) { if (Minimal) return false; - // Constrainst for not packetizing this MI with existing instructions in a - // packet. - // MI is a store instruction. - // CurrentPacketMIs has a SLOT0 only instruction with constraint - // A_RESTRICT_NOSLOT1_STORE/isRestrictNoSlot1Store. - if (MI.mayStore() && isPureSlot0InsnWithNoSlot1Store(MI)) - return false; - if (producesStall(MI)) return false; @@ -1865,25 +1873,8 @@ bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) { return true; } -bool HexagonPacketizerList::isPureSlot0InsnWithNoSlot1Store( - const MachineInstr &MI) { - bool noSlot1Store = false; - bool isSlot0Only = false; - for (auto J : CurrentPacketMIs) { - noSlot1Store |= HII->isRestrictNoSlot1Store(*J); - isSlot0Only |= HII->isPureSlot0(*J); - } - - return (noSlot1Store && isSlot0Only); -} - // V60 forward scheduling. -bool HexagonPacketizerList::producesStall(const MachineInstr &I) { - // If the packet already stalls, then ignore the stall from a subsequent - // instruction in the same packet. - if (PacketStalls) - return false; - +unsigned int HexagonPacketizerList::calcStall(const MachineInstr &I) { // Check whether the previous packet is in a different loop. If this is the // case, there is little point in trying to avoid a stall because that would // favor the rare case (loop entry) over the common case (loop iteration). 
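The PacketStallCycles bookkeeping above replaces a single stall boolean with a worst-case cycle count: a later instruction re-triggers the stall check only if it would stall longer than the packet already does. A rough standalone model of that policy (names are illustrative; the real calcStall walks the schedule DAG):

#include <algorithm>
#include <cassert>

// Simplified model: each candidate instruction reports how many cycles
// it would stall on a dependence from the previous packet, and the
// packet remembers the worst stall it has accepted so far.
struct PacketModel {
  bool PacketStalls = false;
  unsigned PacketStallCycles = 0;

  bool producesStall(unsigned Latency) const {
    if (Latency == 0)
      return false;
    // Ignore the stall unless it is worse than the packet's current one.
    if (PacketStalls)
      return Latency > PacketStallCycles;
    return true;
  }

  void add(unsigned Latency) {
    PacketStalls = PacketStalls || producesStall(Latency);
    PacketStallCycles = std::max(PacketStallCycles, Latency);
  }
};

int main() {
  PacketModel P;
  assert(P.producesStall(2));  // first stalling instruction counts
  P.add(2);
  assert(!P.producesStall(2)); // absorbed by the packet's existing stall
  assert(P.producesStall(3));  // a strictly longer stall still counts
}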
@@ -1895,10 +1886,12 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) { auto *OldBB = OldPacketMIs.front()->getParent(); auto *ThisBB = I.getParent(); if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB)) - return false; + return 0; } SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)]; + if (!SUI) + return 0; // If the latency is 0 and there is a data dependence between this // instruction and any instruction in the current packet, we disregard any @@ -1927,7 +1920,7 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) { if (Pred.getSUnit() == SUJ) if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) || HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I)) - return false; + return 0; } // Check if the latency is greater than one between this instruction and any @@ -1936,10 +1929,20 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) { SUnit *SUJ = MIToSUnit[J]; for (auto &Pred : SUI->Preds) if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1) - return true; + return Pred.getLatency(); } - return false; + return 0; +} + +bool HexagonPacketizerList::producesStall(const MachineInstr &I) { + unsigned int Latency = calcStall(I); + if (Latency == 0) + return false; + // Ignore stall unless it stalls more than previous instruction in packet + if (PacketStalls) + return Latency > PacketStallCycles; + return true; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h index 27a47220570a..6a709e566f86 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h @@ -56,6 +56,9 @@ class HexagonPacketizerList : public VLIWPacketizerList { // Set to true if the packet contains an instruction that stalls with an // instruction from the previous packet. bool PacketStalls = false; + // Set to the number of cycles of stall a given instruction will incur + // because of dependence on instruction in previous packet. + unsigned int PacketStallCycles = 0; // Set to true if the packet has a duplex pair of sub-instructions. bool PacketHasDuplex = false; @@ -156,7 +159,7 @@ protected: bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J); bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J); bool producesStall(const MachineInstr &MI); - bool isPureSlot0InsnWithNoSlot1Store(const MachineInstr &MI); + unsigned int calcStall(const MachineInstr &MI); }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 21386a91c7b3..6aca8d807872 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -443,7 +443,7 @@ auto AlignVectors::createAdjustedPointer(IRBuilder<> &Builder, Value *Ptr, // we don't need to do pointer casts. auto *PtrTy = cast<PointerType>(Ptr->getType()); if (!PtrTy->isOpaque()) { - Type *ElemTy = PtrTy->getElementType(); + Type *ElemTy = PtrTy->getNonOpaquePointerElementType(); int ElemSize = HVC.getAllocSizeOf(ElemTy); if (Adjust % ElemSize == 0 && Adjust != 0) { Value *Tmp0 = @@ -718,7 +718,7 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool { // Maximum alignment present in the whole address group. 
const AddrInfo &WithMaxAlign = - getMaxOf(BaseInfos, [](const AddrInfo &AI) { return AI.HaveAlign; }); + getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; }); Align MaxGiven = WithMaxAlign.HaveAlign; // Minimum alignment present in the move address group. @@ -1181,12 +1181,15 @@ auto HexagonVectorCombine::rescale(IRBuilder<> &Builder, Value *Mask, int ToCount = (FromCount * FromSize) / ToSize; assert((FromCount * FromSize) % ToSize == 0); + auto *FromITy = IntegerType::get(F.getContext(), FromSize * 8); + auto *ToITy = IntegerType::get(F.getContext(), ToSize * 8); + // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> -> // -> trunc to <M x i1>. Value *Ext = Builder.CreateSExt( - Mask, VectorType::get(FromSTy, FromCount, /*Scalable*/ false)); + Mask, VectorType::get(FromITy, FromCount, /*Scalable*/ false)); Value *Cast = Builder.CreateBitCast( - Ext, VectorType::get(ToSTy, ToCount, /*Scalable*/ false)); + Ext, VectorType::get(ToITy, ToCount, /*Scalable*/ false)); return Builder.CreateTrunc( Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable*/ false)); } diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp index f973862a0c9b..94b878e21f4d 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp @@ -659,8 +659,7 @@ void HexagonVectorLoopCarriedReuse::findLoopCarriedDeps() { delete D; } LLVM_DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n"); - LLVM_DEBUG(for (size_t i = 0; i < Dependences.size(); - ++i) { dbgs() << *Dependences[i] << "\n"; }); + LLVM_DEBUG(for (const DepChain *D : Dependences) dbgs() << *D << "\n";); } Pass *llvm::createHexagonVectorLoopCarriedReuseLegacyPass() { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 96c2965296ca..8a866cfe9161 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCShuffler.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" + #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -65,7 +66,8 @@ void HexagonMCChecker::init() { void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg, bool &isTrue) { - if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) { + if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && + HexagonMCInstrInfo::isPredReg(RI, R)) { // Note an used predicate register. PredReg = R; isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI); @@ -123,7 +125,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { // same packet with an instruction that modifies is explicitly. Deal // with such situations individually. SoftDefs.insert(R); - else if (isPredicateRegister(R) && + else if (HexagonMCInstrInfo::isPredReg(RI, R) && HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) // Include implicit late predicates. LatePreds.insert(R); @@ -167,7 +169,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { // side-effect, then note as a soft definition. SoftDefs.insert(*SRI); else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && - isPredicateRegister(*SRI)) + HexagonMCInstrInfo::isPredReg(RI, *SRI)) // Some insns produce predicates too late to be used in the same packet. 
LatePreds.insert(*SRI); else if (i == 0 && HexagonMCInstrInfo::getType(MCII, MCI) == @@ -193,7 +195,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { if (MCI.getOperand(i).isReg()) { unsigned P = MCI.getOperand(i).getReg(); - if (isPredicateRegister(P)) + if (HexagonMCInstrInfo::isPredReg(RI, P)) NewPreds.insert(P); } } @@ -202,7 +204,7 @@ HexagonMCChecker::HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCRegisterInfo const &ri, bool ReportErrors) : Context(Context), MCB(mcb), RI(ri), MCII(MCII), STI(STI), - ReportErrors(ReportErrors), ReversePairs() { + ReportErrors(ReportErrors) { init(); } @@ -210,8 +212,7 @@ HexagonMCChecker::HexagonMCChecker(HexagonMCChecker const &Other, MCSubtargetInfo const &STI, bool CopyReportErrors) : Context(Other.Context), MCB(Other.MCB), RI(Other.RI), MCII(Other.MCII), - STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false), - ReversePairs() { + STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false) { init(); } @@ -233,9 +234,10 @@ bool HexagonMCChecker::check(bool FullCheck) { bool chkHWLoop = checkHWLoop(); bool chkValidTmpDst = FullCheck ? checkValidTmpDst() : true; bool chkLegalVecRegPair = checkLegalVecRegPair(); + bool ChkHVXAccum = checkHVXAccum(); bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl && chkAXOK && chkCofMax1 && chkHWLoop && chkValidTmpDst && - chkLegalVecRegPair; + chkLegalVecRegPair && ChkHVXAccum; return chk; } @@ -274,20 +276,27 @@ static bool isDuplexAGroup(unsigned Opcode) { } static bool isNeitherAnorX(MCInstrInfo const &MCII, MCInst const &ID) { - unsigned Result = 0; + if (HexagonMCInstrInfo::isFloat(MCII, ID)) + return true; unsigned Type = HexagonMCInstrInfo::getType(MCII, ID); - if (Type == HexagonII::TypeDUPLEX) { - unsigned subInst0Opcode = ID.getOperand(0).getInst()->getOpcode(); - unsigned subInst1Opcode = ID.getOperand(1).getInst()->getOpcode(); - Result += !isDuplexAGroup(subInst0Opcode); - Result += !isDuplexAGroup(subInst1Opcode); - } else - Result += - Type != HexagonII::TypeALU32_2op && Type != HexagonII::TypeALU32_3op && - Type != HexagonII::TypeALU32_ADDI && Type != HexagonII::TypeS_2op && - Type != HexagonII::TypeS_3op && - (Type != HexagonII::TypeALU64 || HexagonMCInstrInfo::isFloat(MCII, ID)); - return Result != 0; + switch (Type) { + case HexagonII::TypeALU32_2op: + case HexagonII::TypeALU32_3op: + case HexagonII::TypeALU32_ADDI: + case HexagonII::TypeS_2op: + case HexagonII::TypeS_3op: + case HexagonII::TypeEXTENDER: + case HexagonII::TypeM: + case HexagonII::TypeALU64: + return false; + case HexagonII::TypeSUBINSN: { + return !isDuplexAGroup(ID.getOpcode()); + } + case HexagonII::TypeDUPLEX: + llvm_unreachable("unexpected duplex instruction"); + default: + return true; + } } bool HexagonMCChecker::checkAXOK() { @@ -315,8 +324,7 @@ bool HexagonMCChecker::checkAXOK() { void HexagonMCChecker::reportBranchErrors() { for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); - if (Desc.isBranch() || Desc.isCall() || Desc.isReturn()) + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) reportNote(I.getLoc(), "Branching instruction"); } } @@ -326,8 +334,7 @@ bool HexagonMCChecker::checkHWLoop() { !HexagonMCInstrInfo::isOuterLoop(MCB)) return true; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); - if (Desc.isBranch() || Desc.isCall() 
|| Desc.isReturn()) { + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) { reportError(MCB.getLoc(), "Branches cannot be in a packet with hardware loops"); reportBranchErrors(); @@ -340,8 +347,7 @@ bool HexagonMCChecker::checkHWLoop() { bool HexagonMCChecker::checkCOFMax1() { SmallVector<MCInst const *, 2> BranchLocations; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); - if (Desc.isBranch() || Desc.isCall() || Desc.isReturn()) + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) BranchLocations.push_back(&I); } for (unsigned J = 0, N = BranchLocations.size(); J < N; ++J) { @@ -373,18 +379,8 @@ bool HexagonMCChecker::checkCOFMax1() { } bool HexagonMCChecker::checkSlots() { - unsigned slotsUsed = 0; - for (auto HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) { - MCInst const &MCI = *HMI.getInst(); - if (HexagonMCInstrInfo::isImmext(MCI)) - continue; - if (HexagonMCInstrInfo::isDuplex(MCII, MCI)) - slotsUsed += 2; - else - ++slotsUsed; - } - - if (slotsUsed > HEXAGON_PACKET_SIZE) { + if (HexagonMCInstrInfo::slotsConsumed(MCII, STI, MCB) > + HexagonMCInstrInfo::packetSizeSlots(STI)) { reportError("invalid instruction packet: out of slots"); return false; } @@ -424,81 +420,109 @@ bool HexagonMCChecker::checkPredicates() { // Check legal use of new values. bool HexagonMCChecker::checkNewValues() { - for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { - if (!HexagonMCInstrInfo::isNewValue(MCII, I)) + for (auto const &ConsumerInst : + HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { + if (!HexagonMCInstrInfo::isNewValue(MCII, ConsumerInst)) continue; - auto Consumer = HexagonMCInstrInfo::predicateInfo(MCII, I); - bool Branch = HexagonMCInstrInfo::getDesc(MCII, I).isBranch(); - MCOperand const &Op = HexagonMCInstrInfo::getNewValueOperand(MCII, I); + + const HexagonMCInstrInfo::PredicateInfo ConsumerPredInfo = + HexagonMCInstrInfo::predicateInfo(MCII, ConsumerInst); + + bool Branch = HexagonMCInstrInfo::getDesc(MCII, ConsumerInst).isBranch(); + MCOperand const &Op = + HexagonMCInstrInfo::getNewValueOperand(MCII, ConsumerInst); assert(Op.isReg()); - auto Producer = registerProducer(Op.getReg(), Consumer); - if (std::get<0>(Producer) == nullptr) { - reportError(I.getLoc(), "New value register consumer has no producer"); + + auto Producer = registerProducer(Op.getReg(), ConsumerPredInfo); + const MCInst *const ProducerInst = std::get<0>(Producer); + const HexagonMCInstrInfo::PredicateInfo ProducerPredInfo = + std::get<2>(Producer); + + if (ProducerInst == nullptr) { + reportError(ConsumerInst.getLoc(), + "New value register consumer has no producer"); return false; } if (!RelaxNVChecks) { // Checks that statically prove correct new value consumption - if (std::get<2>(Producer).isPredicated() && - (!Consumer.isPredicated() || - llvm::HexagonMCInstrInfo::getType(MCII, I) == HexagonII::TypeNCJ)) { + if (ProducerPredInfo.isPredicated() && + (!ConsumerPredInfo.isPredicated() || + llvm::HexagonMCInstrInfo::getType(MCII, ConsumerInst) == + HexagonII::TypeNCJ)) { reportNote( - std::get<0>(Producer)->getLoc(), + ProducerInst->getLoc(), "Register producer is predicated and consumer is unconditional"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } - if (std::get<2>(Producer).Register != Hexagon::NoRegister && - std::get<2>(Producer).Register != Consumer.Register) { - 
reportNote(std::get<0>(Producer)->getLoc(), + if (ProducerPredInfo.Register != Hexagon::NoRegister && + ProducerPredInfo.Register != ConsumerPredInfo.Register) { + reportNote(ProducerInst->getLoc(), "Register producer does not use the same predicate " "register as the consumer"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } } - if (std::get<2>(Producer).Register == Consumer.Register && - Consumer.PredicatedTrue != std::get<2>(Producer).PredicatedTrue) { + if (ProducerPredInfo.Register == ConsumerPredInfo.Register && + ConsumerPredInfo.PredicatedTrue != ProducerPredInfo.PredicatedTrue) { reportNote( - std::get<0>(Producer)->getLoc(), + ProducerInst->getLoc(), "Register producer has the opposite predicate sense as consumer"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } - MCInstrDesc const &Desc = - HexagonMCInstrInfo::getDesc(MCII, *std::get<0>(Producer)); - if (Desc.OpInfo[std::get<1>(Producer)].RegClass == + + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, *ProducerInst); + const unsigned ProducerOpIndex = std::get<1>(Producer); + + if (Desc.OpInfo[ProducerOpIndex].RegClass == Hexagon::DoubleRegsRegClassID) { - reportNote(std::get<0>(Producer)->getLoc(), + reportNote(ProducerInst->getLoc(), "Double registers cannot be new-value producers"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } - if ((Desc.mayLoad() && std::get<1>(Producer) == 1) || - (Desc.mayStore() && std::get<1>(Producer) == 0)) { - unsigned Mode = - HexagonMCInstrInfo::getAddrMode(MCII, *std::get<0>(Producer)); + + // The ProducerOpIsMemIndex logic checks for the index of the producer + // register operand. Z-reg load instructions have an implicit operand + // that's not encoded, so the producer won't appear as the 1-th def, it + // will be at the 0-th. + const unsigned ProducerOpSearchIndex = + (HexagonMCInstrInfo::getType(MCII, *ProducerInst) == + HexagonII::TypeCVI_ZW) + ? 
0 + : 1; + + const bool ProducerOpIsMemIndex = + ((Desc.mayLoad() && ProducerOpIndex == ProducerOpSearchIndex) || + (Desc.mayStore() && ProducerOpIndex == 0)); + + if (ProducerOpIsMemIndex) { + unsigned Mode = HexagonMCInstrInfo::getAddrMode(MCII, *ProducerInst); + StringRef ModeError; if (Mode == HexagonII::AbsoluteSet) ModeError = "Absolute-set"; if (Mode == HexagonII::PostInc) ModeError = "Auto-increment"; if (!ModeError.empty()) { - reportNote(std::get<0>(Producer)->getLoc(), + reportNote(ProducerInst->getLoc(), ModeError + " registers cannot be a new-value " "producer"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } } - if (Branch && HexagonMCInstrInfo::isFloat(MCII, *std::get<0>(Producer))) { - reportNote(std::get<0>(Producer)->getLoc(), + if (Branch && HexagonMCInstrInfo::isFloat(MCII, *ProducerInst)) { + reportNote(ProducerInst->getLoc(), "FPU instructions cannot be new-value producers for jumps"); - reportError(I.getLoc(), + reportError(ConsumerInst.getLoc(), "Instruction does not have a valid new register producer"); return false; } @@ -541,9 +565,11 @@ HexagonMCChecker::registerProducer( unsigned Register, HexagonMCInstrInfo::PredicateInfo ConsumerPredicate) { std::tuple<MCInst const *, unsigned, HexagonMCInstrInfo::PredicateInfo> WrongSense; + for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); auto ProducerPredicate = HexagonMCInstrInfo::predicateInfo(MCII, I); + for (unsigned J = 0, N = Desc.getNumDefs(); J < N; ++J) for (auto K = MCRegAliasIterator(I.getOperand(J).getReg(), &RI, true); K.isValid(); ++K) @@ -568,9 +594,15 @@ void HexagonMCChecker::checkRegisterCurDefs() { for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { if (HexagonMCInstrInfo::isCVINew(MCII, I) && HexagonMCInstrInfo::getDesc(MCII, I).mayLoad()) { - unsigned Register = I.getOperand(0).getReg(); - if (!registerUsed(Register)) - reportWarning("Register `" + Twine(RI.getName(Register)) + + const unsigned RegDef = I.getOperand(0).getReg(); + + bool HasRegDefUse = false; + for (MCRegAliasIterator Alias(RegDef, &RI, true); Alias.isValid(); + ++Alias) + HasRegDefUse = HasRegDefUse || registerUsed(*Alias); + + if (!HasRegDefUse) + reportWarning("Register `" + Twine(RI.getName(RegDef)) + "' used with `.cur' " "but not used in the same packet"); } @@ -599,7 +631,7 @@ bool HexagonMCChecker::checkRegisters() { reportErrorRegisters(BadR); return false; } - if (!isPredicateRegister(R) && Defs[R].size() > 1) { + if (!HexagonMCInstrInfo::isPredReg(RI, R) && Defs[R].size() > 1) { // Check for multiple register definitions. 
PredSet &PM = Defs[R]; @@ -784,3 +816,22 @@ bool HexagonMCChecker::checkLegalVecRegPair() { } return true; } + +// Vd.tmp can't be accumulated +bool HexagonMCChecker::checkHVXAccum() +{ + for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) { + bool IsTarget = + HexagonMCInstrInfo::isAccumulator(MCII, I) && I.getOperand(0).isReg(); + if (!IsTarget) + continue; + unsigned int R = I.getOperand(0).getReg(); + TmpDefsIterator It = TmpDefs.find(R); + if (It != TmpDefs.end()) { + reportError("register `" + Twine(RI.getName(R)) + ".tmp" + + "' is accumulated in this packet"); + return false; + } + } + return true; +} diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index dbd3d8ae45e6..b83931eb88ac 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -81,6 +81,10 @@ class HexagonMCChecker { void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue); bool registerUsed(unsigned Register); + + /// \return a tuple of: pointer to the producer instruction or nullptr if + /// none was found, the operand index, and the PredicateInfo for the + /// producer. std::tuple<MCInst const *, unsigned, HexagonMCInstrInfo::PredicateInfo> registerProducer(unsigned Register, HexagonMCInstrInfo::PredicateInfo Predicated); @@ -100,14 +104,10 @@ class HexagonMCChecker { bool checkCOFMax1(); bool checkLegalVecRegPair(); bool checkValidTmpDst(); + bool checkHVXAccum(); static void compoundRegisterMap(unsigned &); - bool isPredicateRegister(unsigned R) const { - return (Hexagon::P0 == R || Hexagon::P1 == R || Hexagon::P2 == R || - Hexagon::P3 == R); - } - bool isLoopRegister(unsigned R) const { return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R || Hexagon::LC1 == R); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 33b2e9a9e302..f8ac35aed7c0 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -712,7 +712,6 @@ unsigned HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, SmallVectorImpl<MCFixup> &Fixups, MCSubtargetInfo const &STI) const { -#ifndef NDEBUG size_t OperandNumber = ~0U; for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) if (&MI.getOperand(i) == &MO) { @@ -720,7 +719,6 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, break; } assert((OperandNumber != ~0U) && "Operand not found"); -#endif if (HexagonMCInstrInfo::isNewValue(MCII, MI) && &MO == &HexagonMCInstrInfo::getNewValueOperand(MCII, MI)) { @@ -777,9 +775,13 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO, assert(!MO.isImm()); if (MO.isReg()) { unsigned Reg = MO.getReg(); - if (HexagonMCInstrInfo::isSubInstruction(MI) || - HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCJ) + switch (HexagonMCInstrInfo::getDesc(MCII, MI).OpInfo[OperandNumber].RegClass) { + case GeneralSubRegsRegClassID: + case GeneralDoubleLow8RegsRegClassID: return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg); + default: + break; + } return MCT.getRegisterInfo()->getEncodingValue(Reg); } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index e7ade7834a9f..3deef95df324 100644 --- 
a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -365,8 +365,10 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset; B != MCI.end(); ++B) { MCInst const *Inst = B->getInst(); - if (JumpInst == Inst) + if (JumpInst == Inst) { + BExtended = false; continue; + } if (HexagonMCInstrInfo::isImmext(*Inst)) { BExtended = true; continue; @@ -405,24 +407,27 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo co if (MCI.size() < 2) return; - bool StartedValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI); - // Create a vector, needed to keep the order of jump instructions. MCInst CheckList(MCI); + // Keep the last known good bundle around in case the shuffle fails. + MCInst LastValidBundle(MCI); + + bool PreviouslyValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI); + // Look for compounds until none are found, only update the bundle when // a compound is found. while (lookForCompound(MCII, Context, CheckList)) { - // Keep the original bundle around in case the shuffle fails. - MCInst OriginalBundle(MCI); - // Need to update the bundle. MCI = CheckList; - if (StartedValid && - !llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI)) { + const bool IsValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI); + if (PreviouslyValid && !IsValid) { LLVM_DEBUG(dbgs() << "Found ERROR\n"); - MCI = OriginalBundle; + MCI = LastValidBundle; + } else if (IsValid) { + LastValidBundle = MCI; + PreviouslyValid = true; } } } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index e1c95f1cc920..36d6c8c9f84b 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -284,8 +284,6 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { case Hexagon::J2_jumprf: case Hexagon::J2_jumprtnew: case Hexagon::J2_jumprfnew: - case Hexagon::J2_jumprtnewpt: - case Hexagon::J2_jumprfnewpt: case Hexagon::PS_jmprett: case Hexagon::PS_jmpretf: case Hexagon::PS_jmprettnew: @@ -303,8 +301,6 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { case Hexagon::L4_return_f: case Hexagon::L4_return_tnew_pnt: case Hexagon::L4_return_fnew_pnt: - case Hexagon::L4_return_tnew_pt: - case Hexagon::L4_return_fnew_pt: // [if ([!]p0[.new])] dealloc_return SrcReg = MCI.getOperand(1).getReg(); if (Hexagon::P0 == SrcReg) { @@ -699,6 +695,7 @@ inline static void addOps(MCInst &subInstPtr, MCInst const &Inst, MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { MCInst Result; + Result.setLoc(Inst.getLoc()); bool Absolute; int64_t Value; switch (Inst.getOpcode()) { @@ -830,7 +827,6 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { Result.setOpcode(Hexagon::SL2_jumpr31_f); break; // none SUBInst if (!p0) jumpr r31 case Hexagon::J2_jumprfnew: - case Hexagon::J2_jumprfnewpt: case Hexagon::PS_jmpretfnewpt: case Hexagon::PS_jmpretfnew: Result.setOpcode(Hexagon::SL2_jumpr31_fnew); @@ -840,7 +836,6 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { Result.setOpcode(Hexagon::SL2_jumpr31_t); break; // none SUBInst if (p0) jumpr r31 case Hexagon::J2_jumprtnew: - case Hexagon::J2_jumprtnewpt: case Hexagon::PS_jmprettnewpt: case Hexagon::PS_jmprettnew: Result.setOpcode(Hexagon::SL2_jumpr31_tnew); 
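tryCompound above now threads a LastValidBundle through its retry loop, so a failed shuffle rolls back to the best packet found so far instead of to the pre-compounding original. A generic sketch of the pattern, under assumed names rather than the LLVM API:

#include <cassert>

// Repeatedly apply a transform step to a working copy, validate each
// candidate, and keep the last candidate that validated so a later
// failure can roll back to it rather than to the starting state.
template <typename T, typename StepFn, typename ValidFn>
T transformKeepingLastValid(T Bundle, StepFn Step, ValidFn Valid) {
  T Work = Bundle;                      // the list the step rewrites
  T LastValid = Bundle;                 // last known-good candidate
  bool PreviouslyValid = Valid(Bundle);
  while (Step(Work)) {                  // false once nothing changes
    Bundle = Work;
    if (PreviouslyValid && !Valid(Bundle)) {
      Bundle = LastValid;               // roll back, keep searching
    } else if (Valid(Bundle)) {
      LastValid = Bundle;
      PreviouslyValid = true;
    }
  }
  return Bundle;
}

int main() {
  // Toy usage: the step counts down; only even values "shuffle".
  int R = transformKeepingLastValid(
      5,
      [](int &W) { return W-- > 1; },   // yields 4, 3, 2, 1, then stops
      [](int B) { return B % 2 == 0; });
  assert(R == 2); // 1 failed validation, so the last good value wins
}

The design point is that each successful compounding attempt advances the fallback state, so one bad shuffle late in the loop no longer discards every earlier improvement.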
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 68ccb20f4f15..494b0e6cbac6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -128,23 +128,28 @@ bool canonicalizePacketImpl(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, bool CheckOk = Check ? Check->check(false) : true; if (!CheckOk) return false; + + MCInst OrigMCB = MCB; + // Examine the packet and convert pairs of instructions to compound // instructions when possible. if (!HexagonDisableCompound) HexagonMCInstrInfo::tryCompound(MCII, STI, Context, MCB); HexagonMCShuffle(Context, false, MCII, STI, MCB); + const SmallVector<DuplexCandidate, 8> possibleDuplexes = + (STI.getFeatureBits()[Hexagon::FeatureDuplex]) + ? HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB) + : SmallVector<DuplexCandidate, 8>(); + // Examine the packet and convert pairs of instructions to duplex // instructions when possible. - if (STI.getFeatureBits() [Hexagon::FeatureDuplex]) { - SmallVector<DuplexCandidate, 8> possibleDuplexes; - possibleDuplexes = - HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB); - HexagonMCShuffle(Context, MCII, STI, MCB, possibleDuplexes); - } + HexagonMCShuffle(Context, MCII, STI, MCB, possibleDuplexes); + // Examines packet and pad the packet, if needed, when an // end-loop is in the bundle. HexagonMCInstrInfo::padEndloop(MCB, Context); + // If compounding and duplexing didn't reduce the size below // 4 or less we have a packet that is too big. if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) { @@ -156,7 +161,9 @@ bool canonicalizePacketImpl(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, CheckOk = Check ? 
Check->check(true) : true; if (!CheckOk) return false; + HexagonMCShuffle(Context, true, MCII, STI, MCB); + return true; } } // namespace @@ -857,16 +864,16 @@ bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) { } int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) { - auto Sentinal = static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) + auto Sentinel = static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) << 8; if (MCI.size() <= Index) - return Sentinal; + return Sentinel; MCOperand const &MCO = MCI.getOperand(Index); if (!MCO.isExpr()) - return Sentinal; + return Sentinel; int64_t Value; if (!MCO.getExpr()->evaluateAsAbsolute(Value)) - return Sentinal; + return Sentinel; return Value; } @@ -915,10 +922,7 @@ void HexagonMCInstrInfo::padEndloop(MCInst &MCB, MCContext &Context) { MCInst Nop; Nop.setOpcode(Hexagon::A2_nop); assert(isBundle(MCB)); - while ((HexagonMCInstrInfo::isInnerLoop(MCB) && - (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) || - ((HexagonMCInstrInfo::isOuterLoop(MCB) && - (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE)))) + while (LoopNeedsPadding(MCB)) MCB.addOperand(MCOperand::createInst(new (Context) MCInst(Nop))); } @@ -1030,3 +1034,19 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer, return Consumer == Producer; return 0; } + +bool HexagonMCInstrInfo::LoopNeedsPadding(MCInst const &MCB) { + return ( + (HexagonMCInstrInfo::isInnerLoop(MCB) && + (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) || + ((HexagonMCInstrInfo::isOuterLoop(MCB) && + (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE)))); +} + +bool HexagonMCInstrInfo::IsABranchingInst(MCInstrInfo const &MCII, + MCSubtargetInfo const &STI, + MCInst const &I) { + assert(!HexagonMCInstrInfo::isBundle(I)); + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I); + return (Desc.isBranch() || Desc.isCall() || Desc.isReturn()); +} diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 5c56db14798f..f0c4a86fde78 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -65,18 +65,24 @@ public: namespace HexagonMCInstrInfo { -size_t const innerLoopOffset = 0; -int64_t const innerLoopMask = 1 << innerLoopOffset; +constexpr size_t innerLoopOffset = 0; +constexpr int64_t innerLoopMask = 1 << innerLoopOffset; -size_t const outerLoopOffset = 1; -int64_t const outerLoopMask = 1 << outerLoopOffset; +constexpr size_t outerLoopOffset = 1; +constexpr int64_t outerLoopMask = 1 << outerLoopOffset; // do not reorder memory load/stores by default load/stores are re-ordered // and by default loads can be re-ordered -size_t const memReorderDisabledOffset = 2; -int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset; +constexpr size_t memReorderDisabledOffset = 2; +constexpr int64_t memReorderDisabledMask = 1 << memReorderDisabledOffset; -size_t const bundleInstructionsOffset = 1; +constexpr size_t splitNoMemOrderOffset = 3; +constexpr int64_t splitNoMemorderMask = 1 << splitNoMemOrderOffset; + +constexpr size_t noShuffleOffset = 4; +constexpr int64_t noShuffleMask = 1 << noShuffleOffset; + +constexpr size_t bundleInstructionsOffset = 1; void addConstant(MCInst &MI, uint64_t Value, MCContext &Context); void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, @@ -95,6 +101,8 @@ bool 
canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCContext &Context, MCInst &MCB, HexagonMCChecker *Checker, bool AttemptCompatibility = false); +bool IsABranchingInst(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCInst const &I); // Create a duplex instruction given the two subinsts MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, @@ -307,6 +315,10 @@ bool mustNotExtend(MCExpr const &Expr); // Returns true if this instruction requires a slot to execute. bool requiresSlot(MCSubtargetInfo const &STI, MCInst const &MCI); + +// Returns true if \a MCB would require endloop padding. +bool LoopNeedsPadding(MCInst const &MCB); + unsigned packetSize(StringRef CPU); // Returns the maximum number of slots available in the given @@ -318,8 +330,7 @@ unsigned packetSizeSlots(MCSubtargetInfo const &STI); unsigned slotsConsumed(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst const &MCI); - -// Pad the bundle with nops to satisfy endloop requirements +// Pad the bundle with nops to satisfy endloop requirements. void padEndloop(MCInst &MCI, MCContext &Context); class PredicateInfo { public: diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp index d38b77b42fbc..d96fade71a84 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp @@ -81,10 +81,9 @@ void HexagonMCShuffler::copyTo(MCInst &MCB) { MCB.addOperand(MCOperand::createImm(BundleFlags)); MCB.setLoc(Loc); // Copy the results into the bundle. - for (HexagonShuffler::iterator I = begin(); I != end(); ++I) { - - MCInst const &MI = I->getDesc(); - MCInst const *Extender = I->getExtender(); + for (auto &I : *this) { + MCInst const &MI = I.getDesc(); + MCInst const *Extender = I.getExtender(); if (Extender) MCB.addOperand(MCOperand::createInst(Extender)); MCB.addOperand(MCOperand::createInst(&MI)); @@ -101,10 +100,10 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) { return false; } -bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal, +bool llvm::HexagonMCShuffle(MCContext &Context, bool ReportErrors, MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &MCB) { - HexagonMCShuffler MCS(Context, Fatal, MCII, STI, MCB); + HexagonMCShuffler MCS(Context, ReportErrors, MCII, STI, MCB); if (DisableShuffle) // Ignore if user chose so. 
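padEndloop above now delegates its condition to LoopNeedsPadding: an end-loop packet below the minimum bundle size is filled with A2_nop instructions. A small model of the predicate and the padding loop; the minimum sizes of 2 (inner) and 3 (outer) are assumptions standing in for HEXAGON_PACKET_INNER_SIZE and HEXAGON_PACKET_OUTER_SIZE, whose values this diff does not quote:

#include <cassert>
#include <vector>

struct BundleModel {
  std::vector<int> Insts; // opcodes; 0 stands in for A2_nop
  bool InnerLoop = false;
  bool OuterLoop = false;
};

// Mirrors LoopNeedsPadding: an endloop0/endloop1 packet must occupy a
// minimum number of slots.
bool loopNeedsPadding(const BundleModel &B) {
  return (B.InnerLoop && B.Insts.size() < 2) ||
         (B.OuterLoop && B.Insts.size() < 3);
}

// Mirrors padEndloop: append NOPs until the size requirement holds.
void padEndloop(BundleModel &B) {
  while (loopNeedsPadding(B))
    B.Insts.push_back(0);
}

int main() {
  BundleModel B;
  B.OuterLoop = true;
  B.Insts = {42};
  padEndloop(B);
  assert(B.Insts.size() == 3); // two NOPs were appended
}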
@@ -128,11 +127,11 @@ bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal, return MCS.reshuffleTo(MCB); } -bool -llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII, - MCSubtargetInfo const &STI, MCInst &MCB, - SmallVector<DuplexCandidate, 8> possibleDuplexes) { - if (DisableShuffle) +bool llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII, + MCSubtargetInfo const &STI, MCInst &MCB, + SmallVector<DuplexCandidate, 8> possibleDuplexes) { + + if (DisableShuffle || possibleDuplexes.size() == 0) return false; if (!HexagonMCInstrInfo::bundleSize(MCB)) { @@ -173,10 +172,8 @@ llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII, HexagonMCShuffler MCS(Context, false, MCII, STI, MCB); doneShuffling = MCS.reshuffleTo(MCB); // shuffle } - if (!doneShuffling) - return true; - return false; + return doneShuffling; } bool llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h index 3410c0ddbd84..4fc8addb27bc 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h @@ -28,16 +28,17 @@ class MCSubtargetInfo; // Insn bundle shuffler. class HexagonMCShuffler : public HexagonShuffler { public: - HexagonMCShuffler(MCContext &Context, bool Fatal, MCInstrInfo const &MCII, - MCSubtargetInfo const &STI, MCInst &MCB) - : HexagonShuffler(Context, Fatal, MCII, STI) { + HexagonMCShuffler(MCContext &Context, bool ReportErrors, + MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCInst &MCB) + : HexagonShuffler(Context, ReportErrors, MCII, STI) { init(MCB); } - HexagonMCShuffler(MCContext &Context, bool Fatal, MCInstrInfo const &MCII, - MCSubtargetInfo const &STI, MCInst &MCB, - MCInst const &AddMI, bool InsertAtFront) - : HexagonShuffler(Context, Fatal, MCII, STI) { + HexagonMCShuffler(MCContext &Context, bool ReportErrors, + MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCInst &MCB, MCInst const &AddMI, bool InsertAtFront) + : HexagonShuffler(Context, ReportErrors, MCII, STI) { init(MCB, AddMI, InsertAtFront); } @@ -52,9 +53,11 @@ private: void init(MCInst &MCB, MCInst const &AddMI, bool InsertAtFront); }; -// Invocation of the shuffler. -bool HexagonMCShuffle(MCContext &Context, bool Fatal, MCInstrInfo const &MCII, - MCSubtargetInfo const &STI, MCInst &MCB); +// Invocation of the shuffler. Returns true if the shuffle succeeded. If +// true, MCB will contain the newly-shuffled packet. 
+bool HexagonMCShuffle(MCContext &Context, bool ReportErrors, + MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCInst &MCB); bool HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &MCB, MCInst const &AddMI, int fixupCount); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index dfdddb50657c..6a08d7503bac 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -517,6 +517,14 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT, return nullptr; } + // Add qfloat subtarget feature by default to v68 and above + // unless explicitely disabled + if (checkFeature(X, Hexagon::ExtensionHVXV68) && + ArchFS.find("-hvx-qfloat", 0) == std::string::npos) { + llvm::FeatureBitset Features = X->getFeatureBits(); + X->setFeatureBits(Features.set(Hexagon::ExtensionHVXQFloat)); + } + if (HexagonDisableDuplex) { llvm::FeatureBitset Features = X->getFeatureBits(); X->setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); @@ -551,21 +559,11 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, } unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { - static std::map<StringRef,unsigned> ElfFlags = { - {"hexagonv5", ELF::EF_HEXAGON_MACH_V5}, - {"hexagonv55", ELF::EF_HEXAGON_MACH_V55}, - {"hexagonv60", ELF::EF_HEXAGON_MACH_V60}, - {"hexagonv62", ELF::EF_HEXAGON_MACH_V62}, - {"hexagonv65", ELF::EF_HEXAGON_MACH_V65}, - {"hexagonv66", ELF::EF_HEXAGON_MACH_V66}, - {"hexagonv67", ELF::EF_HEXAGON_MACH_V67}, - {"hexagonv67t", ELF::EF_HEXAGON_MACH_V67T}, - {"hexagonv68", ELF::EF_HEXAGON_MACH_V68}, - {"hexagonv69", ELF::EF_HEXAGON_MACH_V69}, - }; + using llvm::Hexagon::ElfFlagsByCpuStr; - auto F = ElfFlags.find(STI.getCPU()); - assert(F != ElfFlags.end() && "Unrecognized Architecture"); + const std::string CPU(STI.getCPU().str()); + auto F = ElfFlagsByCpuStr.find(CPU); + assert(F != ElfFlagsByCpuStr.end() && "Unrecognized Architecture"); return F->second; } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 1fce90b82864..d82731e153fe 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -167,7 +167,8 @@ static bool checkHVXPipes(const HVXInstsT &hvxInsts, unsigned startIdx, HexagonShuffler::HexagonShuffler(MCContext &Context, bool ReportErrors, MCInstrInfo const &MCII, MCSubtargetInfo const &STI) - : Context(Context), MCII(MCII), STI(STI), ReportErrors(ReportErrors) { + : Context(Context), BundleFlags(), MCII(MCII), STI(STI), + ReportErrors(ReportErrors), CheckFailure() { reset(); } @@ -244,8 +245,8 @@ void HexagonShuffler::restrictNoSlot1Store( "Instruction does not allow a store in slot 1")); } -bool HexagonShuffler::applySlotRestrictions( - HexagonPacketSummary const &Summary) { +bool HexagonShuffler::applySlotRestrictions(HexagonPacketSummary const &Summary, + const bool DoShuffle) { // These restrictions can modify the slot masks in the instructions // in the Packet member. They should run unconditionally and their // order does not matter. 
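The createHexagonMCSubtargetInfo change above turns HVX QFloat on by default for v68 and later, unless the feature string explicitly negates it. A minimal stand-in for that check (the real code flips a bit in the MCSubtargetInfo FeatureBitset):

#include <cassert>
#include <string>

// QFloat defaults on whenever HVXv68 is present and the architecture
// feature string does not carry an explicit "-hvx-qfloat".
bool enableQFloatByDefault(bool HasHVXV68, const std::string &ArchFS) {
  return HasHVXV68 && ArchFS.find("-hvx-qfloat") == std::string::npos;
}

int main() {
  assert(enableQFloatByDefault(true, "+hvx,+hvxv68"));
  assert(!enableQFloatByDefault(true, "+hvx,+hvxv68,-hvx-qfloat"));
  assert(!enableQFloatByDefault(false, "+hvx,+hvxv66"));
}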
@@ -262,7 +263,7 @@ bool HexagonShuffler::applySlotRestrictions( if (!CheckFailure) restrictBranchOrder(Summary); if (!CheckFailure) - restrictPreferSlot3(Summary); + restrictPreferSlot3(Summary, DoShuffle); return !CheckFailure; } @@ -303,10 +304,9 @@ void HexagonShuffler::restrictBranchOrder(HexagonPacketSummary const &Summary) { Packet = PacketSave; } - reportError("invalid instruction packet: out of slots"); + reportResourceError(Summary, "out of slots"); } - void HexagonShuffler::permitNonSlot() { for (HexagonInstr &ISJ : insts()) { const bool RequiresSlot = HexagonMCInstrInfo::requiresSlot(STI, *ISJ.ID); @@ -319,21 +319,19 @@ bool HexagonShuffler::ValidResourceUsage(HexagonPacketSummary const &Summary) { Optional<HexagonPacket> ShuffledPacket = tryAuction(Summary); if (!ShuffledPacket) { - reportError("invalid instruction packet: slot error"); + reportResourceError(Summary, "slot error"); return false; - } else { - Packet = *ShuffledPacket; } // Verify the CVI slot subscriptions. - llvm::stable_sort(*this, HexagonInstr::lessCVI); + llvm::stable_sort(*ShuffledPacket, HexagonInstr::lessCVI); // create vector of hvx instructions to check HVXInstsT hvxInsts; hvxInsts.clear(); - for (const_iterator I = cbegin(); I != cend(); ++I) { + for (const auto &I : *ShuffledPacket) { struct CVIUnits inst; - inst.Units = I->CVI.getUnits(); - inst.Lanes = I->CVI.getLanes(); + inst.Units = I.CVI.getUnits(); + inst.Lanes = I.CVI.getLanes(); if (inst.Units == 0) continue; // not an hvx inst or an hvx inst that doesn't uses any pipes hvxInsts.push_back(inst); @@ -349,6 +347,9 @@ bool HexagonShuffler::ValidResourceUsage(HexagonPacketSummary const &Summary) { return false; } } + + Packet = *ShuffledPacket; + return true; } @@ -438,6 +439,15 @@ bool HexagonShuffler::restrictStoreLoadOrder( return true; } +static std::string SlotMaskToText(unsigned SlotMask) { + SmallVector<std::string, HEXAGON_PRESHUFFLE_PACKET_SIZE> Slots; + for (unsigned SlotNum = 0; SlotNum < HEXAGON_PACKET_SIZE; SlotNum++) + if ((SlotMask & (1 << SlotNum)) != 0) + Slots.push_back(utostr(SlotNum)); + + return llvm::join(Slots, StringRef(", ")); +} + HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() { HexagonPacketSummary Summary = HexagonPacketSummary(); @@ -454,8 +464,13 @@ HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() { ++Summary.pSlot3Cnt; Summary.PrefSlot3Inst = ISJ; } - Summary.ReservedSlotMask |= + const unsigned ReservedSlots = HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID); + Summary.ReservedSlotMask |= ReservedSlots; + if (ReservedSlots != 0) + AppliedRestrictions.push_back(std::make_pair(ID.getLoc(), + (Twine("Instruction has reserved slots: ") + + SlotMaskToText(ReservedSlots)).str())); switch (HexagonMCInstrInfo::getType(MCII, ID)) { case HexagonII::TypeS_2op: @@ -463,7 +478,8 @@ HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() { case HexagonII::TypeALU64: break; case HexagonII::TypeJ: - Summary.branchInsts.push_back(ISJ); + if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, *ISJ->ID)) + Summary.branchInsts.push_back(ISJ); break; case HexagonII::TypeCVI_VM_VP_LDU: case HexagonII::TypeCVI_VM_LD: @@ -565,14 +581,15 @@ bool HexagonShuffler::ValidPacketMemoryOps( return !InvalidPacket; } -void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary) { +void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary, + const bool DoShuffle) { // flag if an instruction requires to be in slot 3 const bool 
HasOnlySlot3 = llvm::any_of(insts(), [&](HexagonInstr const &I) { return (I.Core.getUnits() == Slot3Mask); }); - const bool NeedsPrefSlot3Shuffle = - (Summary.branchInsts.size() <= 1 && !HasOnlySlot3 && - Summary.pSlot3Cnt == 1 && Summary.PrefSlot3Inst); + const bool NeedsPrefSlot3Shuffle = Summary.branchInsts.size() <= 1 && + !HasOnlySlot3 && Summary.pSlot3Cnt == 1 && + Summary.PrefSlot3Inst && DoShuffle; if (!NeedsPrefSlot3Shuffle) return; @@ -590,9 +607,9 @@ void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary) { } /// Check that the packet is legal and enforce relative insn order. -bool HexagonShuffler::check() { +bool HexagonShuffler::check(const bool RequireShuffle) { const HexagonPacketSummary Summary = GetPacketSummary(); - if (!applySlotRestrictions(Summary)) + if (!applySlotRestrictions(Summary, RequireShuffle)) return false; if (!ValidPacketMemoryOps(Summary)) { @@ -600,13 +617,14 @@ bool HexagonShuffler::check() { return false; } - ValidResourceUsage(Summary); + if (RequireShuffle) + ValidResourceUsage(Summary); return !CheckFailure; } llvm::Optional<HexagonShuffler::HexagonPacket> -HexagonShuffler::tryAuction(HexagonPacketSummary const &Summary) const { +HexagonShuffler::tryAuction(HexagonPacketSummary const &Summary) { HexagonPacket PacketResult = Packet; HexagonUnitAuction AuctionCore(Summary.ReservedSlotMask); llvm::stable_sort(PacketResult, HexagonInstr::lessCore); @@ -635,13 +653,13 @@ bool HexagonShuffler::shuffle() { if (size() > HEXAGON_PACKET_SIZE) { // Ignore a packet with with more than what a packet can hold // or with compound or duplex insns for now. - reportError(Twine("invalid instruction packet")); + reportError("invalid instruction packet"); return false; } // Check and prepare packet. - bool Ok = true; - if (size() > 1 && (Ok = check())) + bool Ok = check(); + if (size() > 1 && Ok) // Reorder the handles for each slot. for (unsigned nSlot = 0, emptySlots = 0; nSlot < HEXAGON_PACKET_SIZE; ++nSlot) { @@ -684,6 +702,32 @@ bool HexagonShuffler::shuffle() { return Ok; } +void HexagonShuffler::reportResourceError(HexagonPacketSummary const &Summary, StringRef Err) { + if (ReportErrors) + reportResourceUsage(Summary); + reportError(Twine("invalid instruction packet: ") + Err); +} + + +void HexagonShuffler::reportResourceUsage(HexagonPacketSummary const &Summary) { + auto SM = Context.getSourceManager(); + if (SM) { + for (HexagonInstr const &I : insts()) { + const unsigned Units = I.Core.getUnits(); + + if (HexagonMCInstrInfo::requiresSlot(STI, *I.ID)) { + const std::string UnitsText = Units ? SlotMaskToText(Units) : "<None>"; + SM->PrintMessage(I.ID->getLoc(), SourceMgr::DK_Note, + Twine("Instruction can utilize slots: ") + + UnitsText); + } + else if (!HexagonMCInstrInfo::isImmext(*I.ID)) + SM->PrintMessage(I.ID->getLoc(), SourceMgr::DK_Note, + "Instruction does not require a slot"); + } + } +} + void HexagonShuffler::reportError(Twine const &Msg) { CheckFailure = true; if (ReportErrors) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 1b4ebc5111db..70992e4c7e81 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -72,16 +72,6 @@ public: using UnitsAndLanes = std::pair<unsigned, unsigned>; private: - // Available HVX slots. 
-  enum {
-    CVI_NONE = 0,
-    CVI_XLANE = 1 << 0,
-    CVI_SHIFT = 1 << 1,
-    CVI_MPY0 = 1 << 2,
-    CVI_MPY1 = 1 << 3,
-    CVI_ZW = 1 << 4
-  };
-
   // Count of adjacent slots that the insn requires to be executed.
   unsigned Lanes;
   // Flag whether the insn is a load or a store.
@@ -177,21 +167,23 @@ protected:
   bool ReportErrors;
   bool CheckFailure;
   std::vector<std::pair<SMLoc, std::string>> AppliedRestrictions;
-  bool applySlotRestrictions(HexagonPacketSummary const &Summary);
+
+  bool applySlotRestrictions(HexagonPacketSummary const &Summary,
+                             const bool DoShuffle);
   void restrictSlot1AOK(HexagonPacketSummary const &Summary);
   void restrictNoSlot1Store(HexagonPacketSummary const &Summary);
   void restrictNoSlot1();
   bool restrictStoreLoadOrder(HexagonPacketSummary const &Summary);
   void restrictBranchOrder(HexagonPacketSummary const &Summary);
-  void restrictPreferSlot3(HexagonPacketSummary const &Summary);
+  void restrictPreferSlot3(HexagonPacketSummary const &Summary,
+                           const bool DoShuffle);
   void permitNonSlot();
 
-  Optional<HexagonPacket> tryAuction(HexagonPacketSummary const &Summary) const;
+  Optional<HexagonPacket> tryAuction(HexagonPacketSummary const &Summary);
 
   HexagonPacketSummary GetPacketSummary();
   bool ValidPacketMemoryOps(HexagonPacketSummary const &Summary) const;
   bool ValidResourceUsage(HexagonPacketSummary const &Summary);
-  bool validPacketInsts() const;
 
 public:
   using iterator = HexagonPacket::iterator;
@@ -205,7 +197,7 @@ public:
   // Reset to initial state.
   void reset();
   // Check if the bundle may be validly shuffled.
-  bool check();
+  bool check(const bool RequireShuffle = true);
   // Reorder the insn handles in the bundle.
   bool shuffle();
@@ -242,6 +234,8 @@ public:
   // Return the error code for the last check or shuffling of the bundle.
   void reportError(Twine const &Msg);
+  void reportResourceError(HexagonPacketSummary const &Summary, StringRef Err);
+  void reportResourceUsage(HexagonPacketSummary const &Summary);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index a994bd7e57a4..660215ca7435 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -141,7 +141,7 @@ struct LanaiOperand : public MCParsedAsmOperand {
     struct MemOp Mem;
   };
 
-  explicit LanaiOperand(KindTy Kind) : MCParsedAsmOperand(), Kind(Kind) {}
+  explicit LanaiOperand(KindTy Kind) : Kind(Kind) {}
 
 public:
   // The functions below are used by the autogenerated ASM matcher and hence to
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 0d9e63c112fb..010ff80ad42a 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -284,7 +284,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight(
 void LanaiTargetLowering::LowerAsmOperandForConstraint(
     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
     SelectionDAG &DAG) const {
-  SDValue Result(nullptr, 0);
+  SDValue Result;
 
   // Only support length 1 constraints for now.
   if (Constraint.length() > 1)
@@ -511,7 +511,7 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
     // the sret argument into rv for the return. Save the argument into
     // a virtual register so that we can access it from the return points.
     if (MF.getFunction().hasStructRetAttr()) {
-      unsigned Reg = LanaiMFI->getSRetReturnReg();
+      Register Reg = LanaiMFI->getSRetReturnReg();
       if (!Reg) {
         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
         LanaiMFI->setSRetReturnReg(Reg);
@@ -577,7 +577,7 @@ LanaiTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   if (DAG.getMachineFunction().getFunction().hasStructRetAttr()) {
     MachineFunction &MF = DAG.getMachineFunction();
     LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
-    unsigned Reg = LanaiMFI->getSRetReturnReg();
+    Register Reg = LanaiMFI->getSRetReturnReg();
     assert(Reg &&
            "SRetReturnReg should have been set in LowerFormalArguments().");
     SDValue Val =
@@ -1077,7 +1077,7 @@ SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op,
 
   // Return the link register, which contains the return address.
   // Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+  Register Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
 }
diff --git a/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index 67443b771d3d..ce79bdafc425 100644
--- a/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -412,9 +412,8 @@ bool LanaiMemAluCombiner::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget<LanaiSubtarget>().getInstrInfo();
 
   bool Modified = false;
-  for (MfIterator MFI = MF.begin(); MFI != MF.end(); ++MFI) {
-    Modified |= combineMemAluInBasicBlock(&*MFI);
-  }
+  for (MachineBasicBlock &MBB : MF)
+    Modified |= combineMemAluInBasicBlock(&MBB);
   return Modified;
 }
 } // namespace
diff --git a/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
index abe20c8e18cf..03cf10205173 100644
--- a/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -165,7 +165,7 @@ void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   if ((isSPLSOpcode(MI.getOpcode()) && !isInt<10>(Offset)) ||
       !isInt<16>(Offset)) {
     assert(RS && "Register scavenging must be on");
-    unsigned Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
+    Register Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
     if (!Reg)
       Reg = RS->scavengeRegister(&Lanai::GPRRegClass, II, SPAdj);
     assert(Reg && "Register scavenger failed");
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
index d9d7847a0c5a..37a4843e1bc4 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -43,4 +43,4 @@ LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
                                CodeGenOpt::Level /*OptLevel*/)
     : LanaiGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU*/ Cpu, FeatureString),
       FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
-      InstrInfo(), TLInfo(TM, *this), TSInfo() {}
+      TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
index 7027d18126bb..d8a66bc8a0da 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
@@ -148,7 +148,7 @@ void LanaiInstPrinter::printInst(const MCInst *MI, uint64_t Address,
 
 void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                     raw_ostream &OS, const char *Modifier) {
-  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg())
     OS << "%" << getRegisterName(Op.getReg());
diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
index 4db879c34ad9..dcd581875f60 100644
--- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
+++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
@@ -1,4 +1,4 @@
-//===---- M68kAsmParser.cpp - Parse M68k assembly to MCInst instructions --===//
+//===-- M68kAsmParser.cpp - Parse M68k assembly to MCInst instructions ----===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
index a08ffa787095..a565ff4e004d 100644
--- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
+++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
@@ -1,4 +1,4 @@
-//===- M68kDisassembler.cpp - Disassembler for M68k -------------*- C++ -*-===//
+//===-- M68kDisassembler.cpp - Disassembler for M68k ------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
index 9cd959012e6f..b3d17184f1fe 100644
--- a/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- M68kCallLowering.cpp - Call lowering -------------------*- C++ -*-===//
+//===-- M68kCallLowering.cpp - Call lowering --------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kCallLowering.h b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
index 47cdefdba100..24212e6dd9c6 100644
--- a/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
+++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
@@ -1,4 +1,4 @@
-//===-- M68kCallLowering.h - Call lowering -------------------*- C++ -*-===//
+//===-- M68kCallLowering.h - Call lowering ----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
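
A note on the recurring unsigned-to-Register change in the Lanai and M68k hunks above: llvm::Register wraps the raw register id, so virtual-versus-physical questions go through the type's own predicates rather than free functions on a bare unsigned, while code like "if (!Reg)" keeps working through the implicit conversion. A minimal sketch of the idiom, assuming only LLVM's Register.h is available (the helper name is ours, not from the patch):

#include "llvm/CodeGen/Register.h"

// Hypothetical helper: classify a register the way the patched code does.
static bool needsVRegDef(llvm::Register Reg) {
  // Reg still converts to unsigned where needed, but the check is typed:
  return Reg.isValid() && Reg.isVirtual(); // not Register::isVirtualRegister(VR)
}
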
diff --git a/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp b/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp index 9ac4ab9a5ba1..a627eccd110d 100644 --- a/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp +++ b/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp @@ -1,4 +1,4 @@ -//===- M68kInstructionSelector.cpp ------------------------------*- C++ -*-===// +//===-- M68kInstructionSelector.cpp -----------------------------*- C++ -*-===// //===----------------------------------------------------------------------===// /// \file /// This file implements the targeting of the InstructionSelector class for diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp index bcbe62816beb..860c0ce29326 100644 --- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp +++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp @@ -1,4 +1,4 @@ -//===-- M68kLegalizerInfo.cpp ----------------------------------*- C++ -*-===// +//===-- M68kLegalizerInfo.cpp -----------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h index 205aa81aedcc..a10401ed1a9a 100644 --- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h +++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h @@ -1,4 +1,4 @@ -//===- M68kLegalizerInfo --------------------------------------*- C++ -*-==// +//===-- M68kLegalizerInfo ---------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp index 5c0f5dae8e37..b6ed6ab28a5d 100644 --- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp +++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp @@ -1,4 +1,4 @@ -//===-- M68kRegisterBankInfo.cpp -------------------------------*- C++ -*-===// +//===-- M68kRegisterBankInfo.cpp --------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h index 853c75df2bb3..6c0b8ca7ba5a 100644 --- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h +++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h @@ -1,4 +1,4 @@ -//===-- M68kRegisterBankInfo.h ---------------------------------*- C++ -*-===// +//===-- M68kRegisterBankInfo.h ----------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -42,4 +42,4 @@ public: getInstrMapping(const MachineInstr &MI) const override; }; } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_GLSEL_M68KREGISTERBANKINFO_H diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td b/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td index 942677a60e6c..2a00ec065cd4 100644 --- a/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td +++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td @@ -1,4 +1,4 @@ -//===-- M68kRegisterBanks.td - Describe the M68k Banks -------*- tablegen -*-===// +//===-- M68kRegisterBanks.td - Describe the M68k Banks -----*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68k.h b/llvm/lib/Target/M68k/M68k.h index cef40bee7d93..b6069d736deb 100644 --- a/llvm/lib/Target/M68k/M68k.h +++ b/llvm/lib/Target/M68k/M68k.h @@ -1,4 +1,4 @@ -//===- M68k.h - Top-level interface for M68k representation -*- C++ -*-===// +//===-- M68k.h - Top-level interface for M68k representation ----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -54,4 +54,4 @@ createM68kInstructionSelector(const M68kTargetMachine &, const M68kSubtarget &, } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68K_H diff --git a/llvm/lib/Target/M68k/M68k.td b/llvm/lib/Target/M68k/M68k.td index fde491e1b6d5..de7a6c82d110 100644 --- a/llvm/lib/Target/M68k/M68k.td +++ b/llvm/lib/Target/M68k/M68k.td @@ -1,4 +1,4 @@ -//===-- M68k.td - Motorola 680x0 target definitions ------*- tablegen -*-===// +//===-- M68k.td - Motorola 680x0 target definitions --------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp index 08b7153632b4..3bcce9e3ba3b 100644 --- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp +++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp @@ -1,4 +1,4 @@ -//===----- M68kAsmPrinter.cpp - M68k LLVM Assembly Printer -----*- C++ -*-===// +//===-- M68kAsmPrinter.cpp - M68k LLVM Assembly Printer ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.h b/llvm/lib/Target/M68k/M68kAsmPrinter.h index dff3bb876336..1a76e3bf4e27 100644 --- a/llvm/lib/Target/M68k/M68kAsmPrinter.h +++ b/llvm/lib/Target/M68k/M68kAsmPrinter.h @@ -1,4 +1,4 @@ -//===----- M68kAsmPrinter.h - M68k LLVM Assembly Printer -------- C++ -*--===// +//===-- M68kAsmPrinter.h - M68k LLVM Assembly Printer -----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -66,4 +66,4 @@ public: }; } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KASMPRINTER_H diff --git a/llvm/lib/Target/M68k/M68kCallingConv.h b/llvm/lib/Target/M68k/M68kCallingConv.h index 20ffa993897f..6823df5472df 100644 --- a/llvm/lib/Target/M68k/M68kCallingConv.h +++ b/llvm/lib/Target/M68k/M68kCallingConv.h @@ -1,4 +1,4 @@ -//===-- M68kCallingConv.h - M68k Custom CC Routines ---------*- C++ -*-===// +//===-- M68kCallingConv.h - M68k Custom CC Routines -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -74,4 +74,4 @@ inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KCALLINGCONV_H diff --git a/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp b/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp index 4149ae92ffe9..7f0c0dd92dbb 100644 --- a/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp +++ b/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp @@ -1,4 +1,4 @@ -//===----- M68kCollapseMOVEMPass.cpp - Expand MOVEM pass --------*- C++ -*-===// +//===-- M68kCollapseMOVEMPass.cpp - Expand MOVEM pass -----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp index 6a4aeaab518a..acfa30f28c2b 100644 --- a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp +++ b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp @@ -1,4 +1,4 @@ -//===--M68kExpandPseudo.cpp - Expand pseudo instructions ------*- C++ -*-===// +//===-- M68kExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp index 66ea6ae38f43..643e156f9446 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp +++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp @@ -1,4 +1,4 @@ -//===-- M68kFrameLowering.cpp - M68k Frame Information ------*- C++ -*-===// +//===-- M68kFrameLowering.cpp - M68k Frame Information ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -157,7 +157,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) @@ -463,7 +463,7 @@ void M68kFrameLowering::emitPrologueCalleeSavedFrameMoves( // Calculate offsets. for (const auto &I : CSI) { int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); BuildCFI(MBB, MBBI, DL, @@ -485,7 +485,7 @@ void M68kFrameLowering::emitPrologue(MachineFunction &MF, uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate. 
bool HasFP = hasFP(MF); bool NeedsDwarfCFI = MMI.hasDebugInfo() || Fn.needsUnwindTableEntry(); - unsigned FramePtr = TRI->getFrameRegister(MF); + Register FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = FramePtr; unsigned BasePtr = TRI->getBaseRegister(); @@ -683,7 +683,7 @@ void M68kFrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - unsigned FramePtr = TRI->getFrameRegister(MF); + Register FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = FramePtr; // Get the number of bytes to allocate from the FrameInfo. @@ -819,7 +819,7 @@ bool M68kFrameLowering::assignCalleeSavedSpillSlots( // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. - unsigned FPReg = TRI->getFrameRegister(MF); + Register FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0, e = CSI.size(); i < e; ++i) { if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) { CSI.erase(CSI.begin() + i); @@ -842,7 +842,7 @@ bool M68kFrameLowering::spillCalleeSavedRegisters( unsigned Mask = 0; for (const auto &Info : CSI) { FI = std::max(FI, Info.getFrameIdx()); - unsigned Reg = Info.getReg(); + Register Reg = Info.getReg(); unsigned Shift = MRI.getSpillRegisterOrder(Reg); Mask |= 1 << Shift; } @@ -856,7 +856,7 @@ bool M68kFrameLowering::spillCalleeSavedRegisters( const MachineFunction &MF = *MBB.getParent(); const MachineRegisterInfo &RI = MF.getRegInfo(); for (const auto &Info : CSI) { - unsigned Reg = Info.getReg(); + Register Reg = Info.getReg(); bool IsLiveIn = RI.isLiveIn(Reg); if (!IsLiveIn) MBB.addLiveIn(Reg); @@ -877,7 +877,7 @@ bool M68kFrameLowering::restoreCalleeSavedRegisters( unsigned Mask = 0; for (const auto &Info : CSI) { FI = std::max(FI, Info.getFrameIdx()); - unsigned Reg = Info.getReg(); + Register Reg = Info.getReg(); unsigned Shift = MRI.getSpillRegisterOrder(Reg); Mask |= 1 << Shift; } diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.h b/llvm/lib/Target/M68k/M68kFrameLowering.h index 0eba9e08d858..a5349377232e 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.h +++ b/llvm/lib/Target/M68k/M68kFrameLowering.h @@ -1,4 +1,4 @@ -//===- M68kFrameLowering.h - Define frame lowering for M68k -*- C++ -*-===// +//===-- M68kFrameLowering.h - Define frame lowering for M68k ----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -169,4 +169,4 @@ public: }; } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KFRAMELOWERING_H diff --git a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp index 0076c2647df3..9ef97b96ea9a 100644 --- a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp +++ b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp @@ -1,4 +1,4 @@ -//===- M68kISelDAGToDAG.cpp - M68k Dag to Dag Inst Selector -*- C++ -*-===// +//===-- M68kISelDAGToDAG.cpp - M68k Dag to Dag Inst Selector ----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
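
The M68kFrameLowering spill/restore hunks above only retype their loop variables; the value they compute is a MOVEM-style mask in which each callee-saved register contributes one bit at its spill-order position (the Mask |= 1 << Shift lines). That arithmetic is plain bit accumulation, sketched standalone below; SpillOrders stands in for the values MRI.getSpillRegisterOrder would return, and the function name is ours:

#include <cstdint>
#include <vector>

// Fold one bit per callee-saved register into a 16-bit MOVEM-style mask.
static uint16_t buildSpillMask(const std::vector<unsigned> &SpillOrders) {
  uint16_t Mask = 0;
  for (unsigned Order : SpillOrders)
    Mask |= static_cast<uint16_t>(1u << Order); // Order assumed in 0..15
  return Mask;
}
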
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 79b395f8f984..dba190a2ebc0 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- M68kISelLowering.cpp - M68k DAG Lowering Impl ------*- C++ -*--===//
+//===-- M68kISelLowering.cpp - M68k DAG Lowering Impl -----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -268,7 +268,7 @@ static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
   int FI = INT_MAX;
   if (Arg.getOpcode() == ISD::CopyFromReg) {
-    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
     if (!Register::isVirtualRegister(VR))
       return false;
     MachineInstr *Def = MRI->getVRegDef(VR);
@@ -900,7 +900,7 @@ SDValue M68kTargetLowering::LowerFormalArguments(
       else
         llvm_unreachable("Unknown argument type!");
 
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
 
       // If this is an 8 or 16-bit value, it is really passed promoted to 32
@@ -1276,7 +1276,7 @@ bool M68kTargetLowering::IsEligibleForTailCallOptimization(
       CCValAssign &VA = ArgLocs[i];
       if (!VA.isRegLoc())
         continue;
-      unsigned Reg = VA.getLocReg();
+      Register Reg = VA.getLocReg();
       switch (Reg) {
       default:
         break;
@@ -1409,32 +1409,32 @@ SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, SetCC);
 }
 
-/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
-/// according to equal/not-equal condition code \p CC.
+/// Create a BTST (Bit Test) node - Test bit \p BitNo in \p Src and set
+/// condition according to equal/not-equal condition code \p CC.
 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
                                    const SDLoc &DL, SelectionDAG &DAG) {
-  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
+  // If Src is i8, promote it to i32 with any_extend. There is no i8 BTST
   // instruction. Since the shift amount is in-range-or-undefined, we know
   // that doing a bittest on the i32 value is ok.
   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
     Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
 
   // If the operand types disagree, extend the shift amount to match. Since
-  // BT ignores high bits (like shifts) we can use anyextend.
+  // BTST ignores high bits (like shifts) we can use anyextend.
   if (Src.getValueType() != BitNo.getValueType())
     BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
 
-  SDValue BT = DAG.getNode(M68kISD::BT, DL, MVT::i32, Src, BitNo);
+  SDValue BTST = DAG.getNode(M68kISD::BTST, DL, MVT::i32, Src, BitNo);
 
   // NOTE BTST sets CCR.Z flag
   M68k::CondCode Cond = CC == ISD::SETEQ ? M68k::COND_NE : M68k::COND_EQ;
   return DAG.getNode(M68kISD::SETCC, DL, MVT::i8,
-                     DAG.getConstant(Cond, DL, MVT::i8), BT);
+                     DAG.getConstant(Cond, DL, MVT::i8), BTST);
 }
 
-/// Result of 'and' is compared against zero. Change to a BT node if possible.
-static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
-                            SelectionDAG &DAG) {
+/// Result of 'and' is compared against zero. Change to a BTST node if possible.
+static SDValue LowerAndToBTST(SDValue And, ISD::CondCode CC, const SDLoc &DL,
+                              SelectionDAG &DAG) {
   SDValue Op0 = And.getOperand(0);
   SDValue Op1 = And.getOperand(1);
   if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -1468,7 +1468,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
     RHS = AndLHS.getOperand(1);
   }
 
-  // Use BT if the immediate can't be encoded in a TEST instruction.
+  // Use BTST if the immediate can't be encoded in a TEST instruction.
   if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
     LHS = AndLHS;
     RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), DL, LHS.getValueType());
@@ -1592,8 +1592,8 @@ static unsigned TranslateM68kCC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
 }
 
 // Convert (truncate (srl X, N) to i1) to (bt X, N)
-static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, const SDLoc &DL,
-                                 SelectionDAG &DAG) {
+static SDValue LowerTruncateToBTST(SDValue Op, ISD::CondCode CC,
+                                   const SDLoc &DL, SelectionDAG &DAG) {
   assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
          "Expected TRUNCATE to i1 node");
@@ -1889,14 +1889,14 @@ SDValue M68kTargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned M68kCC,
 }
 
 /// Result of 'and' or 'trunc to i1' is compared against zero.
-/// Change to a BT node if possible.
-SDValue M68kTargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
-                                      const SDLoc &DL,
-                                      SelectionDAG &DAG) const {
+/// Change to a BTST node if possible.
+SDValue M68kTargetLowering::LowerToBTST(SDValue Op, ISD::CondCode CC,
+                                        const SDLoc &DL,
+                                        SelectionDAG &DAG) const {
   if (Op.getOpcode() == ISD::AND)
-    return LowerAndToBT(Op, CC, DL, DAG);
+    return LowerAndToBTST(Op, CC, DL, DAG);
   if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
-    return LowerTruncateToBT(Op, CC, DL, DAG);
+    return LowerTruncateToBTST(Op, CC, DL, DAG);
   return SDValue();
 }
 
@@ -1909,14 +1909,14 @@ SDValue M68kTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
-  // Optimize to BT if possible.
-  // Lower (X & (1 << N)) == 0 to BT(X, N).
-  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
-  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
-  // Lower (trunc (X >> N) to i1) to BT(X, N).
+  // Optimize to BTST if possible.
+  // Lower (X & (1 << N)) == 0 to BTST(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BTST(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BTST(X, N).
+  // Lower (trunc (X >> N) to i1) to BTST(X, N).
   if (Op0.hasOneUse() && isNullConstant(Op1) &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-    if (SDValue NewSetCC = LowerToBT(Op0, CC, DL, DAG)) {
+    if (SDValue NewSetCC = LowerToBTST(Op0, CC, DL, DAG)) {
       if (VT == MVT::i1)
         return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, NewSetCC);
       return NewSetCC;
@@ -2099,7 +2099,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
     bool IllegalFPCMov = false;
 
-    if ((isM68kLogicalCmp(Cmp) && !IllegalFPCMov) || Opc == M68kISD::BT) {
+    if ((isM68kLogicalCmp(Cmp) && !IllegalFPCMov) || Opc == M68kISD::BTST) {
       Cond = Cmp;
       addTest = false;
     }
@@ -2163,7 +2163,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   // We know the result of AND is compared against zero. Try to match
   // it to BT.
   if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
-    if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+    if (SDValue NewSetCC = LowerToBTST(Cond, ISD::SETNE, DL, DAG)) {
       CC = NewSetCC.getOperand(0);
       Cond = NewSetCC.getOperand(1);
       addTest = false;
@@ -2282,7 +2282,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
       SDValue Cmp = Cond.getOperand(1);
       unsigned Opc = Cmp.getOpcode();
 
-      if (isM68kLogicalCmp(Cmp) || Opc == M68kISD::BT) {
+      if (isM68kLogicalCmp(Cmp) || Opc == M68kISD::BTST) {
         Cond = Cmp;
         AddTest = false;
       } else {
@@ -2427,7 +2427,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 
     // We know the result is compared against zero. Try to match it to BT.
     if (Cond.hasOneUse()) {
-      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+      if (SDValue NewSetCC = LowerToBTST(Cond, ISD::SETNE, DL, DAG)) {
         CC = NewSetCC.getOperand(0);
         Cond = NewSetCC.getOperand(1);
         AddTest = false;
@@ -3101,9 +3101,9 @@ M68kTargetLowering::EmitLoweredSelect(MachineInstr &MI,
   // destination registers, and the registers that went into the PHI.
   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
-    unsigned DestReg = MIIt->getOperand(0).getReg();
-    unsigned Op1Reg = MIIt->getOperand(1).getReg();
-    unsigned Op2Reg = MIIt->getOperand(2).getReg();
+    Register DestReg = MIIt->getOperand(0).getReg();
+    Register Op1Reg = MIIt->getOperand(1).getReg();
+    Register Op2Reg = MIIt->getOperand(2).getReg();
 
     // If this CMOV we are generating is the opposite condition from
     // the jump we generated, then we have to swap the operands for the
@@ -3211,13 +3211,13 @@ SDValue M68kTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     auto &MRI = MF.getRegInfo();
     auto SPTy = getPointerTy(DAG.getDataLayout());
     auto *ARClass = getRegClassFor(SPTy);
-    unsigned Vreg = MRI.createVirtualRegister(ARClass);
+    Register Vreg = MRI.createVirtualRegister(ARClass);
     Chain = DAG.getCopyToReg(Chain, DL, Vreg, Size);
     Result = DAG.getNode(M68kISD::SEG_ALLOCA, DL, SPTy, Chain,
                          DAG.getRegister(Vreg, SPTy));
   } else {
     auto &TLI = DAG.getTargetLoweringInfo();
-    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+    Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                     " not tell us which reg is the stack pointer!");
@@ -3391,8 +3391,8 @@ const char *M68kTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "M68kISD::AND";
   case M68kISD::CMP:
     return "M68kISD::CMP";
-  case M68kISD::BT:
-    return "M68kISD::BT";
+  case M68kISD::BTST:
+    return "M68kISD::BTST";
   case M68kISD::SELECT:
     return "M68kISD::SELECT";
   case M68kISD::CMOV:
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.h b/llvm/lib/Target/M68k/M68kISelLowering.h
index 6a5a40a8815b..9375a99962eb 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.h
+++ b/llvm/lib/Target/M68k/M68kISelLowering.h
@@ -1,4 +1,4 @@
-//===-- M68kISelLowering.h - M68k DAG Lowering Interface ----*- C++ -*-===//
+//===-- M68kISelLowering.h - M68k DAG Lowering Interface --------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -43,7 +43,7 @@ enum NodeType {
   CMP,
 
   /// M68k bit-test instructions.
- BT, + BTST, /// M68k Select SELECT, @@ -204,8 +204,8 @@ private: const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL, - SelectionDAG &DAG) const; + SDValue LowerToBTST(SDValue And, ISD::CondCode CC, const SDLoc &DL, + SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -276,4 +276,4 @@ private: }; } // namespace llvm -#endif // M68kISELLOWERING_H +#endif // LLVM_LIB_TARGET_M68K_M68KISELLOWERING_H diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index b2c05365d30b..ef50de576641 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -150,8 +150,7 @@ let mayLoad = 1, mayStore = 1 in { // FIXME MxBiArOp_FMR/FMI cannot consume CCR from MxAdd/MxSub which leads for // MxAdd to survive the match and subsequent mismatch. -class MxBiArOp_FMR<string MN, SDNode NODE, MxType TYPE, - MxOperand MEMOpd, ComplexPattern MEMPat, +class MxBiArOp_FMR<string MN, MxType TYPE, MxOperand MEMOpd, bits<4> CMD, MxEncEA EA, MxEncExt EXT> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", @@ -160,8 +159,7 @@ class MxBiArOp_FMR<string MN, SDNode NODE, MxType TYPE, !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"EA"#TYPE.RLet), MxBeadDReg<1>, EA, EXT>>; -class MxBiArOp_FMI<string MN, SDNode NODE, MxType TYPE, - MxOperand MEMOpd, ComplexPattern MEMPat, +class MxBiArOp_FMI<string MN, MxType TYPE, MxOperand MEMOpd, bits<4> CMD, MxEncEA MEMEA, MxEncExt MEMExt> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", @@ -218,47 +216,47 @@ multiclass MxBiArOp_DF<string MN, SDNode NODE, bit isComm, def NAME#"32di" : MxBiArOp_RFRI_xEA<MN, NODE, MxType32d, CMD>; // op $reg, $mem - def NAME#"8pd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.POp, MxType8.PPat, + def NAME#"8pd" : MxBiArOp_FMR<MN, MxType8d, MxType8.POp, CMD, MxEncEAp_0, MxExtI16_0>; - def NAME#"16pd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.POp, MxType16.PPat, + def NAME#"16pd" : MxBiArOp_FMR<MN, MxType16d, MxType16.POp, CMD, MxEncEAp_0, MxExtI16_0>; - def NAME#"32pd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.POp, MxType32.PPat, + def NAME#"32pd" : MxBiArOp_FMR<MN, MxType32d, MxType32.POp, CMD, MxEncEAp_0, MxExtI16_0>; - def NAME#"8fd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.FOp, MxType8.FPat, + def NAME#"8fd" : MxBiArOp_FMR<MN, MxType8d, MxType8.FOp, CMD, MxEncEAf_0, MxExtBrief_0>; - def NAME#"16fd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.FOp, MxType16.FPat, + def NAME#"16fd" : MxBiArOp_FMR<MN, MxType16d, MxType16.FOp, CMD, MxEncEAf_0, MxExtBrief_0>; - def NAME#"32fd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.FOp, MxType32.FPat, + def NAME#"32fd" : MxBiArOp_FMR<MN, MxType32d, MxType32.FOp, CMD, MxEncEAf_0, MxExtBrief_0>; - def NAME#"8jd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.JOp, MxType8.JPat, + def NAME#"8jd" : MxBiArOp_FMR<MN, MxType8d, MxType8.JOp, CMD, MxEncEAj_0, MxExtEmpty>; - def NAME#"16jd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.JOp, MxType16.JPat, + def NAME#"16jd" : MxBiArOp_FMR<MN, MxType16d, MxType16.JOp, CMD, MxEncEAj_0, MxExtEmpty>; - def NAME#"32jd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.JOp, MxType32.JPat, + def NAME#"32jd" : MxBiArOp_FMR<MN, MxType32d, MxType32.JOp, CMD, 
MxEncEAj_0, MxExtEmpty>; // op $imm, $mem - def NAME#"8pi" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.POp, MxType8.PPat, + def NAME#"8pi" : MxBiArOp_FMI<MN, MxType8, MxType8.POp, CMDI, MxEncEAp_0, MxExtI16_0>; - def NAME#"16pi" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.POp, MxType16.PPat, + def NAME#"16pi" : MxBiArOp_FMI<MN, MxType16, MxType16.POp, CMDI, MxEncEAp_0, MxExtI16_0>; - def NAME#"32pi" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.POp, MxType32.PPat, + def NAME#"32pi" : MxBiArOp_FMI<MN, MxType32, MxType32.POp, CMDI, MxEncEAp_0, MxExtI16_0>; - def NAME#"8fi" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.FOp, MxType8.FPat, + def NAME#"8fi" : MxBiArOp_FMI<MN, MxType8, MxType8.FOp, CMDI, MxEncEAf_0, MxExtBrief_0>; - def NAME#"16fi" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.FOp, MxType16.FPat, + def NAME#"16fi" : MxBiArOp_FMI<MN, MxType16, MxType16.FOp, CMDI, MxEncEAf_0, MxExtBrief_0>; - def NAME#"32fi" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.FOp, MxType32.FPat, + def NAME#"32fi" : MxBiArOp_FMI<MN, MxType32, MxType32.FOp, CMDI, MxEncEAf_0, MxExtBrief_0>; - def NAME#"8ji" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.JOp, MxType8.JPat, + def NAME#"8ji" : MxBiArOp_FMI<MN, MxType8, MxType8.JOp, CMDI, MxEncEAj_0, MxExtEmpty>; - def NAME#"16ji" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.JOp, MxType16.JPat, + def NAME#"16ji" : MxBiArOp_FMI<MN, MxType16, MxType16.JOp, CMDI, MxEncEAj_0, MxExtEmpty>; - def NAME#"32ji" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.JOp, MxType32.JPat, + def NAME#"32ji" : MxBiArOp_FMI<MN, MxType32, MxType32.JOp, CMDI, MxEncEAj_0, MxExtEmpty>; def NAME#"16dr" : MxBiArOp_RFRR_xEA<MN, NODE, MxType16d, MxType16r, @@ -284,8 +282,7 @@ multiclass MxBiArOp_DF<string MN, SDNode NODE, bit isComm, // operations do not produce CCR we should not match them against Mx nodes that // produce it. let Pattern = [(null_frag)] in -multiclass MxBiArOp_AF<string MN, SDNode NODE, bit isComm, - bits<4> CMD, bits<4> CMDI> { +multiclass MxBiArOp_AF<string MN, SDNode NODE, bits<4> CMD> { def NAME#"32ak" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.KOp, MxType32.KPat, CMD, MxEncEAk, MxExtBrief_2>; @@ -307,9 +304,9 @@ multiclass MxBiArOp_AF<string MN, SDNode NODE, bit isComm, // NOTE These naturally produce CCR defm ADD : MxBiArOp_DF<"add", MxAdd, 1, 0xD, 0x6>; -defm ADD : MxBiArOp_AF<"adda", MxAdd, 1, 0xD, 0x6>; +defm ADD : MxBiArOp_AF<"adda", MxAdd, 0xD>; defm SUB : MxBiArOp_DF<"sub", MxSub, 0, 0x9, 0x4>; -defm SUB : MxBiArOp_AF<"suba", MxSub, 0, 0x9, 0x4>; +defm SUB : MxBiArOp_AF<"suba", MxSub, 0x9>; let Uses = [CCR], Defs = [CCR] in { diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td index d97ca50f74a9..d610bce5c277 100644 --- a/llvm/lib/Target/M68k/M68kInstrBits.td +++ b/llvm/lib/Target/M68k/M68kInstrBits.td @@ -1,4 +1,4 @@ -//===------- M68kInstrBits.td - Bit Manipulation Instrs --*- tablegen -*-===// +//===-- M68kInstrBits.td - Bit Manipulation Instrs ---------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ /// /// Machine: /// -/// BCNG [ ] BCLR [ ] BSET [ ] BTST [~] +/// BCHG [ ] BCLR [ ] BSET [ ] BTST [~] /// /// Map: /// @@ -51,24 +51,24 @@ class MxBTSTEnc_I<MxBead8Imm IMM, MxEncEA EA, MxEncExt EXT> let Defs = [CCR] in { class MxBTST_RR<MxType TYPE> : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBt TYPE.VT:$dst, TYPE.VT:$bitno))], + [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.VT:$bitno))], MxBTSTEnc_R<MxBeadDReg<1>, MxEncEAd_0, MxExtEmpty>>; class MxBTST_RI<MxType TYPE> : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBt TYPE.VT:$dst, TYPE.IPat:$bitno))], + [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.IPat:$bitno))], MxBTSTEnc_I<MxBead8Imm<1>, MxEncEAd_0, MxExtEmpty>>; class MxBTST_MR<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat, MxEncEA EA, MxEncExt EXT> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))], + [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))], MxBTSTEnc_R<MxBeadDReg<1>, EA, EXT>>; class MxBTST_MI<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat, MxEncEA EA, MxEncExt EXT> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))], + [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))], MxBTSTEnc_I<MxBead8Imm<1>, EA, EXT>>; } // Defs = [CCR] diff --git a/llvm/lib/Target/M68k/M68kInstrBuilder.h b/llvm/lib/Target/M68k/M68kInstrBuilder.h index e32b1b047a2b..e85bd270287c 100644 --- a/llvm/lib/Target/M68k/M68kInstrBuilder.h +++ b/llvm/lib/Target/M68k/M68kInstrBuilder.h @@ -1,4 +1,4 @@ -//===-- M68kInstrBuilder.h - Functions to build M68k insts --*- C++ -*-===// +//===-- M68kInstrBuilder.h - Functions to build M68k insts ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -91,4 +91,4 @@ addMemOperand(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { } // end namespace M68k } // end namespace llvm -#endif // LLVM_LIB_TARGET_M6800_M6800INSTRBUILDER_H +#endif // LLVM_LIB_TARGET_M68K_M68KINSTRBUILDER_H diff --git a/llvm/lib/Target/M68k/M68kInstrCompiler.td b/llvm/lib/Target/M68k/M68kInstrCompiler.td index 8fb331dec0e9..2ecf5ca0e6d0 100644 --- a/llvm/lib/Target/M68k/M68kInstrCompiler.td +++ b/llvm/lib/Target/M68k/M68kInstrCompiler.td @@ -1,4 +1,4 @@ -//===-- M68kInstrCompiler.td - Pseudos and Patterns ------*- tablegen -*-===// +//===-- M68kInstrCompiler.td - Pseudos and Patterns --------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kInstrControl.td b/llvm/lib/Target/M68k/M68kInstrControl.td index 9f87833ab0e2..be9045b6e0d2 100644 --- a/llvm/lib/Target/M68k/M68kInstrControl.td +++ b/llvm/lib/Target/M68k/M68kInstrControl.td @@ -1,4 +1,4 @@ -//===-- M68kInstrControl.td - Control Flow Instructions --*- tablegen -*-===// +//===-- M68kInstrControl.td - Control Flow Instructions ----*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
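
For readers tracking the BT-to-BTST rename through M68kInstrBits.td: BTST tests one bit of its operand and only sets CCR.Z, which is why the lowering earlier maps SETEQ to COND_NE and SETNE to COND_EQ (Z ends up set exactly when the tested bit is zero). The bit-test identities the SETCC patterns rely on can be sanity-checked in plain C++:

#include <cassert>
#include <cstdint>

static bool bitSet(uint32_t X, unsigned N) { return (X >> N) & 1u; }

int main() {
  const uint32_t X = 0b1010;
  // (X & (1 << N)) == 0 holds exactly when bit N is clear...
  assert(((X & (1u << 2)) == 0) == !bitSet(X, 2));
  // ...and ((X >> N) & 1) != 0 exactly when bit N is set.
  assert(((X >> 1) & 1u) == bitSet(X, 1));
  return 0;
}
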
diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td index 40b9e4a2a7fa..3dd5d9f8c7ac 100644 --- a/llvm/lib/Target/M68k/M68kInstrData.td +++ b/llvm/lib/Target/M68k/M68kInstrData.td @@ -1,4 +1,4 @@ -//== M68kInstrData.td - M68k Data Movement Instructions -*- tablegen --===// +//===-- M68kInstrData.td - M68k Data Movement Instructions -*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td index 99b7ffd17971..7e0c96a5b1f6 100644 --- a/llvm/lib/Target/M68k/M68kInstrFormats.td +++ b/llvm/lib/Target/M68k/M68kInstrFormats.td @@ -1,4 +1,4 @@ -//=== M68kInstrFormats.td - M68k Instruction Formats ---*- tablegen -*-===// +//===-- M68kInstrFormats.td - M68k Instruction Formats -----*- tablegen -*-===// // The LLVM Compiler Infrastructure // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index 639bcd455687..105c816f9885 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -1,4 +1,4 @@ -//===-- M68kInstrInfo.cpp - M68k Instruction Information ----*- C++ -*-===// +//===-- M68kInstrInfo.cpp - M68k Instruction Information --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -348,8 +348,8 @@ void M68kInstrInfo::AddZExt(MachineBasicBlock &MBB, bool M68kInstrInfo::ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst, MVT MVTSrc) const { unsigned Move = MVTDst == MVT::i16 ? 
M68k::MOV16rr : M68k::MOV32rr; - unsigned Dst = MIB->getOperand(0).getReg(); - unsigned Src = MIB->getOperand(1).getReg(); + Register Dst = MIB->getOperand(0).getReg(); + Register Src = MIB->getOperand(1).getReg(); assert(Dst != Src && "You cannot use the same Regs with MOVX_RR"); @@ -394,8 +394,8 @@ bool M68kInstrInfo::ExpandMOVSZX_RR(MachineInstrBuilder &MIB, bool IsSigned, else // i32 Move = M68k::MOV32rr; - unsigned Dst = MIB->getOperand(0).getReg(); - unsigned Src = MIB->getOperand(1).getReg(); + Register Dst = MIB->getOperand(0).getReg(); + Register Src = MIB->getOperand(1).getReg(); assert(Dst != Src && "You cannot use the same Regs with MOVSX_RR"); @@ -437,7 +437,7 @@ bool M68kInstrInfo::ExpandMOVSZX_RM(MachineInstrBuilder &MIB, bool IsSigned, MVT MVTSrc) const { LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to LOAD and "); - unsigned Dst = MIB->getOperand(0).getReg(); + Register Dst = MIB->getOperand(0).getReg(); // We need the subreg of Dst to make instruction verifier happy because the // real machine instruction consumes and produces values of the same size and @@ -559,7 +559,7 @@ bool M68kInstrInfo::ExpandMOVEM(MachineInstrBuilder &MIB, static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h index 6aced1487365..84d50c181ead 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.h +++ b/llvm/lib/Target/M68k/M68kInstrInfo.h @@ -1,4 +1,4 @@ -//===-- M68kInstrInfo.h - M68k Instruction Information ------*- C++ -*-===// +//===-- M68kInstrInfo.h - M68k Instruction Information ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -336,4 +336,4 @@ public: } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KINSTRINFO_H diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td index ed6cd9ecf442..c581dd91eaaa 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.td +++ b/llvm/lib/Target/M68k/M68kInstrInfo.td @@ -1,4 +1,4 @@ -//== M68kInstrInfo.td - Main M68k Instruction Definition -*- tablegen -*-=// +//===-- M68kInstrInfo.td - Main M68k Instruction Definition -*- tablegen -*-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -137,7 +137,7 @@ def MxSMul : SDNode<"M68kISD::SMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>; def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_2BiArithCCROut, [SDNPCommutative]>; def MxCmp : SDNode<"M68kISD::CMP", MxSDT_CmpTest>; -def MxBt : SDNode<"M68kISD::BT", MxSDT_CmpTest>; +def MxBtst : SDNode<"M68kISD::BTST", MxSDT_CmpTest>; def MxCmov : SDNode<"M68kISD::CMOV", MxSDT_Cmov>; def MxBrCond : SDNode<"M68kISD::BRCOND", MxSDT_BrCond, [SDNPHasChain]>; @@ -587,8 +587,8 @@ class MxType<ValueType vt, string prefix, string postfix, // qOp: Supported PCD operand // qPat: What PCD pattern is used MxOperand qOp, ComplexPattern qPat, - // kOp: Supported PCD operand - // kPat: What PCD pattern is used + // kOp: Supported PCI operand + // kPat: What PCI pattern is used MxOperand kOp, ComplexPattern kPat, // iOp: Supported immediate operand // iPat: What immediate pattern is used diff --git a/llvm/lib/Target/M68k/M68kInstrShiftRotate.td b/llvm/lib/Target/M68k/M68kInstrShiftRotate.td index cab687638076..f1967ec11928 100644 --- a/llvm/lib/Target/M68k/M68kInstrShiftRotate.td +++ b/llvm/lib/Target/M68k/M68kInstrShiftRotate.td @@ -1,4 +1,4 @@ -//===------ M68kInstrShiftRotate.td - Logical Instrs -----*- tablegen -*-===// +//===-- M68kInstrShiftRotate.td - Logical Instrs -----------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp index f14361559b13..a0b1452ee663 100644 --- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp +++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp @@ -1,4 +1,4 @@ -//===-- M68kMCInstLower.cpp - M68k MachineInstr to MCInst ---*- C++ -*-===// +//===-- M68kMCInstLower.cpp - M68k MachineInstr to MCInst -------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.h b/llvm/lib/Target/M68k/M68kMCInstLower.h index d6160629545e..76d9a36f70ef 100644 --- a/llvm/lib/Target/M68k/M68kMCInstLower.h +++ b/llvm/lib/Target/M68k/M68kMCInstLower.h @@ -1,4 +1,4 @@ -//===-- M68kMCInstLower.h - Lower MachineInstr to MCInst -----*- C++ -*--===// +//===-- M68kMCInstLower.h - Lower MachineInstr to MCInst --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -51,4 +51,4 @@ public: }; } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KMCINSTLOWER_H diff --git a/llvm/lib/Target/M68k/M68kMachineFunction.cpp b/llvm/lib/Target/M68k/M68kMachineFunction.cpp index 3d048df7ba49..b1e7369116d7 100644 --- a/llvm/lib/Target/M68k/M68kMachineFunction.cpp +++ b/llvm/lib/Target/M68k/M68kMachineFunction.cpp @@ -1,4 +1,4 @@ -//===-- M68kMachineFunctionInfo.cpp - M68k private data ----*- C++ -*--===// +//===-- M68kMachineFunctionInfo.cpp - M68k private data ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
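
One thing worth noting from the M68kInstrInfo.cpp hunks a little further up: the ExpandMOVX_RR/ExpandMOVSZX_RR changes are type-only (Register instead of unsigned), and judging from the hunk context the expansion still pairs a plain MOV16rr/MOV32rr with an extend helper (AddSExt/AddZExt appear in the surrounding code). The value semantics such a pseudo must preserve are ordinary sign/zero extension; a plain-C++ statement of them, for orientation only:

#include <cstdint>

// What a move-with-sign-extend pseudo must compute once expanded:
static int32_t signExtend8To32(uint8_t V) { return static_cast<int8_t>(V); }
// And the zero-extending flavor: the high bits are simply cleared.
static uint32_t zeroExtend16To32(uint16_t V) { return V; }
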
diff --git a/llvm/lib/Target/M68k/M68kMachineFunction.h b/llvm/lib/Target/M68k/M68kMachineFunction.h index 5760bdd4b9e3..93c5255199d4 100644 --- a/llvm/lib/Target/M68k/M68kMachineFunction.h +++ b/llvm/lib/Target/M68k/M68kMachineFunction.h @@ -1,4 +1,4 @@ -//===-- M68kMachineFunctionInfo.h - M68k private data ---------*- C++ -*-=// +//===-- M68kMachineFunctionInfo.h - M68k private data -----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -111,4 +111,4 @@ private: } // end of namespace llvm -#endif // M68K_MACHINE_FUNCTION_INFO_H +#endif // LLVM_LIB_TARGET_M68K_M68KMACHINEFUNCTION_H diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp index 69d16035b1d9..0cae7ac4e312 100644 --- a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp +++ b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===-- M68kRegisterInfo.cpp - CPU0 Register Information -----*- C++ -*--===// +//===-- M68kRegisterInfo.cpp - CPU0 Register Information --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.h b/llvm/lib/Target/M68k/M68kRegisterInfo.h index 51b94294772c..7f822e1cb34f 100644 --- a/llvm/lib/Target/M68k/M68kRegisterInfo.h +++ b/llvm/lib/Target/M68k/M68kRegisterInfo.h @@ -1,4 +1,4 @@ -//===-- M68kRegisterInfo.h - M68k Register Information Impl --*- C++ --===// +//===-- M68kRegisterInfo.h - M68k Register Information Impl -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -106,4 +106,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KREGISTERINFO_H diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.td b/llvm/lib/Target/M68k/M68kRegisterInfo.td index e2ea2967f75b..49874a2b1099 100644 --- a/llvm/lib/Target/M68k/M68kRegisterInfo.td +++ b/llvm/lib/Target/M68k/M68kRegisterInfo.td @@ -1,4 +1,4 @@ -//== M68kRegisterInfo.td - M68k register definitions ----*- tablegen -*-==// +//==-- M68kRegisterInfo.td - M68k register definitions ------*- tablegen -*-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kSchedule.td b/llvm/lib/Target/M68k/M68kSchedule.td index a94cd8f31e2e..6a1bf0c6a020 100644 --- a/llvm/lib/Target/M68k/M68kSchedule.td +++ b/llvm/lib/Target/M68k/M68kSchedule.td @@ -1,4 +1,4 @@ -//===-- M68kSchedule.td - M68k Scheduling Definitions --*- tablegen -*-===// +//===-- M68kSchedule.td - M68k Scheduling Definitions ------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp index 991889706e67..ec3830243daf 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.cpp +++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp @@ -1,4 +1,4 @@ -//===-- M68kSubtarget.cpp - M68k Subtarget Information ------*- C++ -*-===// +//===-- M68kSubtarget.cpp - M68k Subtarget Information ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h index f45cb7edca1f..9bf2984983a1 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.h +++ b/llvm/lib/Target/M68k/M68kSubtarget.h @@ -1,4 +1,4 @@ -//===-- M68kSubtarget.h - Define Subtarget for the M68k -----*- C++ -*-===// +//===-- M68kSubtarget.h - Define Subtarget for the M68k ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,8 +11,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_CPU0_M68KSUBTARGET_H -#define LLVM_LIB_TARGET_CPU0_M68KSUBTARGET_H +#ifndef LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H +#define LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H #include "M68kFrameLowering.h" #include "M68kISelLowering.h" @@ -179,4 +179,4 @@ public: }; } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp index e8126c6219e8..fd21fe6bcea8 100644 --- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp +++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp @@ -1,4 +1,4 @@ -//===-- M68kTargetMachine.cpp - M68k target machine ---------*- C++ -*-===// +//===-- M68kTargetMachine.cpp - M68k Target Machine -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.h b/llvm/lib/Target/M68k/M68kTargetMachine.h index 34fae8e45504..8dda720774e7 100644 --- a/llvm/lib/Target/M68k/M68kTargetMachine.h +++ b/llvm/lib/Target/M68k/M68kTargetMachine.h @@ -1,4 +1,4 @@ -//===-- M68kTargetMachine.h - Define TargetMachine for M68k ----- C++ -===// +//===-- M68kTargetMachine.h - Define TargetMachine for M68k -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -53,4 +53,4 @@ public: }; } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KTARGETMACHINE_H diff --git a/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp b/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp index 3e26b37e7760..4986d5dbebb9 100644 --- a/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp +++ b/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp @@ -1,4 +1,4 @@ -//===-- M68kELFTargetObjectFile.cpp - M68k Object Files -----*- C++ -*-===// +//===-- M68kELFTargetObjectFile.cpp - M68k Object Files ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/M68kTargetObjectFile.h b/llvm/lib/Target/M68k/M68kTargetObjectFile.h index dbc5375d5423..80a7d0d6e120 100644 --- a/llvm/lib/Target/M68k/M68kTargetObjectFile.h +++ b/llvm/lib/Target/M68k/M68kTargetObjectFile.h @@ -1,4 +1,4 @@ -//===-- M68kELFTargetObjectFile.h - M68k Object Info ---------*- C++ -====// +//===-- M68kELFTargetObjectFile.h - M68k Object Info ------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -28,4 +28,4 @@ public: }; } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_M68KTARGETOBJECTFILE_H diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index c1f88fb78ee1..b66557ec6c3a 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -1,4 +1,4 @@ -//===-- M68kAsmBackend.cpp - M68k Assembler Backend ---------*- C++ -*-===// +//===-- M68kAsmBackend.cpp - M68k Assembler Backend -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h index 7c56cfdf3123..4883f647e214 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h @@ -1,4 +1,4 @@ -//===-- M68kBaseInfo.h - Top level definitions for M68k MC --*- C++ -*-----===// +//===-- M68kBaseInfo.h - Top level definitions for M68k MC ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -244,4 +244,4 @@ static inline unsigned getMaskedSpillRegister(unsigned order) { } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KBASEINFO_H diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp index 4c9a3297424d..27f1b3a3fac8 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp @@ -1,4 +1,4 @@ -//===---------- M68kELFObjectWriter.cpp - M68k ELF Writer ---*- C++ -*-===// +//===-- M68kELFObjectWriter.cpp - M68k ELF Writer ---------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h index 2b760dec9e41..54a0e98fea6e 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h @@ -1,4 +1,4 @@ -//===-- M68kFixupKinds.h - M68k Specific Fixup Entries ------*- C++ -*-===// +//===-- M68kFixupKinds.h - M68k Specific Fixup Entries ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -51,4 +51,4 @@ static inline MCFixupKind getFixupForSize(unsigned Size, bool isPCRel) { } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68k_MCTARGETDESC_M68kFIXUPKINDS_H diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp index a2e41437ee21..9ba28622b5b5 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp @@ -1,4 +1,4 @@ -//===-- M68kInstPrinter.cpp - Convert M68k MCInst to asm ----*- C++ -*-===// +//===-- M68kInstPrinter.cpp - Convert M68k MCInst to asm --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
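
Many of the header hunks in this import do nothing but annotate the closing #endif with the guard macro, which makes short hunks like the ones above much easier to read. The convention being applied, shown with an illustrative guard name rather than a real file:

#ifndef LLVM_LIB_TARGET_M68K_FOO_H // hypothetical file, for illustration
#define LLVM_LIB_TARGET_M68K_FOO_H

// ... declarations ...

#endif // LLVM_LIB_TARGET_M68K_FOO_H
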
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h index ec26bc4ddbfd..239268dd7159 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h @@ -1,4 +1,4 @@ -//===-- M68kInstPrinter.h - Convert M68k MCInst to asm ------*- C++ -*-===// +//===-- M68kInstPrinter.h - Convert M68k MCInst to asm ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -166,4 +166,4 @@ private: }; } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_INSTPRINTER_M68KINSTPRINTER_H diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp index ee2041012bb9..005d2d38f53d 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp @@ -1,4 +1,4 @@ -//===-- M68kMCAsmInfo.cpp - M68k Asm Properties -------------*- C++ -*-===// +//===-- M68kMCAsmInfo.cpp - M68k Asm Properties -----------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h index b3a58cc61223..873264d88674 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h @@ -1,4 +1,4 @@ -//===-- M68kMCAsmInfo.h - M68k Asm Info --------------------*- C++ -*--===// +//===-- M68kMCAsmInfo.h - M68k Asm Info -------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -28,4 +28,4 @@ public: } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCASMINFO_H diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp index 9708abaadf98..9227bd6c3a78 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp @@ -1,4 +1,4 @@ -//===-- M68kMCCodeEmitter.cpp - Convert M68k code emitter ---*- C++ -*-===// +//===-- M68kMCCodeEmitter.cpp - Convert M68k code emitter -------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h index 242a1297206a..aba705aa54b6 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h @@ -1,4 +1,4 @@ -//===-- M68kMCCodeEmitter.h - M68k Code Emitter ----------------*- C++ -*--===// +//===-- M68kMCCodeEmitter.h - M68k Code Emitter -----------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -25,4 +25,4 @@ const uint8_t *getMCInstrBeads(unsigned); } // namespace M68k } // namespace llvm -#endif +#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCCODEEMITTER_H diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp index 9f4db895a821..2606e22410fc 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp @@ -1,4 +1,4 @@ -//===-- M68kMCTargetDesc.cpp - M68k Target Descriptions -----*- C++ -*-===// +//===-- M68kMCTargetDesc.cpp - M68k Target Descriptions ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h index a0ebca0ce36c..aa53e13af4fc 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h @@ -1,4 +1,4 @@ -//===-- M68kMCTargetDesc.h - M68k Target Descriptions -------*- C++ -*-===// +//===-- M68kMCTargetDesc.h - M68k Target Descriptions -----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -58,4 +58,4 @@ std::unique_ptr<MCObjectTargetWriter> createM68kELFObjectWriter(uint8_t OSABI); #define GET_SUBTARGETINFO_ENUM #include "M68kGenSubtargetInfo.inc" -#endif +#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCTARGETDESC_H diff --git a/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp b/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp index 2a225b8a43cd..4701f46b0298 100644 --- a/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp +++ b/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp @@ -1,4 +1,4 @@ -//===-- M68kTargetInfo.cpp - M68k Target Implementation -----*- C++ -*-===// +//===-- M68kTargetInfo.cpp - M68k Target Implementation ---------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index c1677baf52a7..13cba8b079a9 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -114,13 +114,14 @@ class MSP430Operand : public MCParsedAsmOperand { public: MSP430Operand(StringRef Tok, SMLoc const &S) - : Base(), Kind(k_Tok), Tok(Tok), Start(S), End(S) {} + : Kind(k_Tok), Tok(Tok), Start(S), End(S) {} MSP430Operand(KindTy Kind, unsigned Reg, SMLoc const &S, SMLoc const &E) - : Base(), Kind(Kind), Reg(Reg), Start(S), End(E) {} + : Kind(Kind), Reg(Reg), Start(S), End(E) {} MSP430Operand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E) - : Base(), Kind(k_Imm), Imm(Imm), Start(S), End(E) {} - MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S, SMLoc const &E) - : Base(), Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {} + : Kind(k_Imm), Imm(Imm), Start(S), End(E) {} + MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S, + SMLoc const &E) + : Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {} void addRegOperands(MCInst &Inst, unsigned N) const { assert((Kind == k_Reg || Kind == k_IndReg || Kind == k_PostIndReg) && diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index 0cdd1f4f701f..bb5351af6523 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -9,7 +9,6 @@ #include "MCTargetDesc/MSP430FixupKinds.h" #include "MCTargetDesc/MSP430MCTargetDesc.h" -#include "MCTargetDesc/MSP430MCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCObjectWriter.h" diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index 4ef9a567d453..6a8dc3502496 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -190,7 +190,7 @@ bool MSP430FrameLowering::spillCalleeSavedRegisters( MFI->setCalleeSavedFrameSize(CSI.size() * 2); for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); // Add the callee-saved register as live-in. It's killed at the spill. 
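The MSP430Operand constructor hunks above drop the explicit "Base()" from the member-initializer lists (Base here names the class's MCParsedAsmOperand base). A base subobject with an accessible default constructor is default-constructed whether or not it is spelled out, so the initializer was pure noise of the kind clang-tidy's readability-redundant-member-init check flags. The same cleanup, as a minimal sketch with hypothetical names:

    struct Operand : MCParsedAsmOperand {
      unsigned Kind;
      // Before: Operand(unsigned K) : MCParsedAsmOperand(), Kind(K) {}
      // After: same object, one fewer token to read.
      Operand(unsigned K) : Kind(K) {}
    };

The same sweep removes "FrameLowering()" and "MCParsedAsmOperand()" from the MSP430 and Mips initializer lists later in this diff.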
MBB.addLiveIn(Reg); BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r)) diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index c64a44a0ef95..aebfc6b0ae2e 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -705,7 +705,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments( for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { if (Ins[i].Flags.isSRet()) { - unsigned Reg = FuncInfo->getSRetReturnReg(); + Register Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { Reg = MF.getRegInfo().createVirtualRegister( getRegClassFor(MVT::i16)); @@ -772,7 +772,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (MF.getFunction().hasStructRetAttr()) { MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>(); - unsigned Reg = FuncInfo->getSRetReturnReg(); + Register Reg = FuncInfo->getSRetReturnReg(); if (!Reg) llvm_unreachable("sret virtual register not created in entry block"); @@ -1402,12 +1402,12 @@ bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // MSP430 implicitly zero-extends 8-bit results in 16-bit registers. - return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16); + return false && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16); } bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // MSP430 implicitly zero-extends 8-bit results in 16-bit registers. - return 0 && VT1 == MVT::i8 && VT2 == MVT::i16; + return false && VT1 == MVT::i8 && VT2 == MVT::i16; } bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { diff --git a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp index 2fd58717c4db..0604d47597e2 100644 --- a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp @@ -57,5 +57,5 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) - : MSP430GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(), + : MSP430GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 01b5dff2e448..736c41f8ac03 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -827,8 +827,7 @@ private: } Kind; public: - MipsOperand(KindTy K, MipsAsmParser &Parser) - : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {} + MipsOperand(KindTy K, MipsAsmParser &Parser) : Kind(K), AsmParser(Parser) {} ~MipsOperand() override { switch (Kind) { diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index bfe413a152b6..a3dbe6f84a1e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -197,7 +197,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = (int64_t)Value / 2; // We now check if Value can be encoded as a 26-bit signed immediate. 
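A change that recurs from here on replaces "unsigned Reg" with "Register Reg" (and, in the same spirit, literal 0/1 in boolean returns with false/true). llvm::Register is a value wrapper around the old unsigned register id: it converts implicitly in both directions, so existing call sites compile unchanged, but it documents intent and carries helper queries. A short sketch, assuming an MRI of type MachineRegisterInfo is in scope and doSomethingWith is a placeholder:

    Register R = MRI.createVirtualRegister(&Mips::GPR32RegClass);
    if (!R)              // 0 still means "no register"
      return false;
    if (R.isVirtual())   // vs. R.isPhysical()
      doSomethingWith(R);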
if (!isInt<26>(Value)) { - Ctx.reportFatalError(Fixup.getLoc(), "out of range PC26 fixup"); + Ctx.reportError(Fixup.getLoc(), "out of range PC26 fixup"); return 0; } break; diff --git a/llvm/lib/Target/Mips/Mips.h b/llvm/lib/Target/Mips/Mips.h index b3faaab436f0..faf58545db62 100644 --- a/llvm/lib/Target/Mips/Mips.h +++ b/llvm/lib/Target/Mips/Mips.h @@ -38,6 +38,7 @@ namespace llvm { FunctionPass *createMicroMipsSizeReducePass(); FunctionPass *createMipsExpandPseudoPass(); FunctionPass *createMipsPreLegalizeCombiner(); + FunctionPass *createMipsMulMulBugPass(); InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &, MipsSubtarget &, @@ -47,6 +48,7 @@ namespace llvm { void initializeMipsBranchExpansionPass(PassRegistry &); void initializeMicroMipsSizeReducePass(PassRegistry &); void initializeMipsPreLegalizerCombinerPass(PassRegistry&); + void initializeMipsMulMulBugFixPass(PassRegistry&); } // end namespace llvm; #endif diff --git a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index 622f2039f9e4..4f4e3f3f2ed7 100644 --- a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -74,7 +74,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, for (const CalleeSavedInfo &I : CSI) { int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); unsigned DReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DReg, Offset)); @@ -124,7 +124,7 @@ bool Mips16FrameLowering::spillCalleeSavedRegisters( // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA) && MF->getFrameInfo().isReturnAddressTaken(); if (!IsRAAndRetAddrIsTaken) diff --git a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index ddd28d095e51..50147c019bfd 100644 --- a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -121,7 +121,7 @@ bool Mips16DAGToDAGISel::selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base, } // Addresses of the form FI+const or FI|const if (CurDAG->isBaseWithConstantOffset(Addr)) { - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); + auto *CN = cast<ConstantSDNode>(Addr.getOperand(1)); if (isInt<16>(CN->getSExtValue())) { // If the first operand is a FI, get the TargetFI Node if (SPAllowed) { diff --git a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp index 136612c59d96..78ffe00c020c 100644 --- a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp @@ -451,7 +451,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops, // So for now we always save S2. The optimization will be done // in a follow-on patch. 
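Two behavioral notes on the hunks above. First, MCContext::reportError records the diagnostic and lets assembly continue, so several out-of-range fixups can be surfaced in one run, where reportFatalError aborted on the first. Second, in Mips16ISelDAGToDAG the dyn_cast is tightened to cast, since the isBaseWithConstantOffset guard already establishes that operand 1 is a ConstantSDNode. The distinction, as a minimal sketch (use is a placeholder):

    if (auto *CN = dyn_cast<ConstantSDNode>(N)) // null if N is something else
      use(CN);
    auto *CN2 = cast<ConstantSDNode>(N); // asserts on mismatch; use only when
                                         // the dynamic type is already known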
// - if (1 || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet)) + if (true || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet)) FuncInfo->setSaveS2(); } // one more look at list of intrinsics diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index 3403ec01aef2..02d0e770ba66 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -190,7 +190,7 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB, // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. - unsigned Reg = CSI[e-i-1].getReg(); + Register Reg = CSI[e-i-1].getReg(); switch (Reg) { case Mips::RA: case Mips::S0: diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp index f6f43da9abf8..563118dfe627 100644 --- a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -37,7 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-registerinfo" -Mips16RegisterInfo::Mips16RegisterInfo() : MipsRegisterInfo() {} +Mips16RegisterInfo::Mips16RegisterInfo() {} bool Mips16RegisterInfo::requiresRegisterScavenging (const MachineFunction &MF) const { diff --git a/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp index ae2b83c414db..33da0ff31be8 100644 --- a/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp +++ b/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp @@ -25,8 +25,8 @@ void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) { return; } - for (InstSeqLs::iterator Iter = SeqLs.begin(); Iter != SeqLs.end(); ++Iter) - Iter->push_back(I); + for (auto &S : SeqLs) + S.push_back(I); } void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 5d026785b921..4bd8845e9cb9 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -80,13 +80,9 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) { MipsFI = MF.getInfo<MipsFunctionInfo>(); if (Subtarget->inMips16Mode()) - for (std::map< - const char *, - const Mips16HardFloatInfo::FuncSignature *>::const_iterator - it = MipsFI->StubsNeeded.begin(); - it != MipsFI->StubsNeeded.end(); ++it) { - const char *Symbol = it->first; - const Mips16HardFloatInfo::FuncSignature *Signature = it->second; + for (const auto &I : MipsFI->StubsNeeded) { + const char *Symbol = I.first; + const Mips16HardFloatInfo::FuncSignature *Signature = I.second; if (StubsNeeded.find(Symbol) == StubsNeeded.end()) StubsNeeded[Symbol] = Signature; } @@ -341,7 +337,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() { unsigned CSFPRegsSize = 0; for (const auto &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); unsigned RegNum = TRI->getEncodingValue(Reg); // If it's a floating point register, set the FPU Bitmask. @@ -1279,11 +1275,11 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) { // Align all blocks that are jumped to through jump table. 
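Much of the Mips churn in this import is one mechanical modernization: index- and iterator-based loops become range-based for loops. The distilled pattern, using the MipsAnalyzeImmediate hunk above as the model:

    // Before: iterator type spelled out by hand.
    for (InstSeqLs::iterator It = SeqLs.begin(); It != SeqLs.end(); ++It)
      It->push_back(I);

    // After: same iteration order, same mutation, less plumbing.
    for (auto &S : SeqLs)
      S.push_back(I);

The reference in "auto &S" matters: iterating by value would mutate copies and silently drop the push_back.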
if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) { const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables(); - for (unsigned I = 0; I < JT.size(); ++I) { - const std::vector<MachineBasicBlock*> &MBBs = JT[I].MBBs; + for (const auto &I : JT) { + const std::vector<MachineBasicBlock *> &MBBs = I.MBBs; - for (unsigned J = 0; J < MBBs.size(); ++J) - MBBs[J]->setAlignment(MIPS_NACL_BUNDLE_ALIGN); + for (MachineBasicBlock *MBB : MBBs) + MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN); } } diff --git a/llvm/lib/Target/Mips/MipsCallLowering.h b/llvm/lib/Target/Mips/MipsCallLowering.h index 1d1406da3201..9f114d55db4c 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.h +++ b/llvm/lib/Target/Mips/MipsCallLowering.h @@ -18,7 +18,6 @@ namespace llvm { -class MachineMemOperand; class MipsTargetLowering; class MipsCallLowering : public CallLowering { diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 491d379bfe0b..1efbf5570287 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -604,9 +604,9 @@ MipsConstantIslands::CPEntry std::vector<CPEntry> &CPEs = CPEntries[CPI]; // Number of entries per constpool index should be small, just do a // linear search. - for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { - if (CPEs[i].CPEMI == CPEMI) - return &CPEs[i]; + for (CPEntry &CPE : CPEs) { + if (CPE.CPEMI == CPEMI) + return &CPE; } return nullptr; } @@ -1052,27 +1052,27 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) // No. Look for previously created clones of the CPE that are in range. unsigned CPI = CPEMI->getOperand(1).getIndex(); std::vector<CPEntry> &CPEs = CPEntries[CPI]; - for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + for (CPEntry &CPE : CPEs) { // We already tried this one - if (CPEs[i].CPEMI == CPEMI) + if (CPE.CPEMI == CPEMI) continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == nullptr) + if (CPE.CPEMI == nullptr) continue; - if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), - U.NegOk)) { - LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" - << CPEs[i].CPI << "\n"); + if (isCPEntryInRange(UserMI, UserOffset, CPE.CPEMI, U.getMaxDisp(), + U.NegOk)) { + LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPE.CPI + << "\n"); // Point the CPUser node to the replacement - U.CPEMI = CPEs[i].CPEMI; + U.CPEMI = CPE.CPEMI; // Change the CPI in the instruction operand to refer to the clone. for (MachineOperand &MO : UserMI->operands()) if (MO.isCPI()) { - MO.setIndex(CPEs[i].CPI); + MO.setIndex(CPE.CPI); break; } // Adjust the refcount of the clone... - CPEs[i].RefCount++; + CPE.RefCount++; // ...and the original. If we didn't remove the old entry, none of the // addresses changed, so we don't need another pass. return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; @@ -1108,27 +1108,27 @@ int MipsConstantIslands::findLongFormInRangeCPEntry // No. Look for previously created clones of the CPE that are in range. 
unsigned CPI = CPEMI->getOperand(1).getIndex(); std::vector<CPEntry> &CPEs = CPEntries[CPI]; - for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + for (CPEntry &CPE : CPEs) { // We already tried this one - if (CPEs[i].CPEMI == CPEMI) + if (CPE.CPEMI == CPEMI) continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == nullptr) + if (CPE.CPEMI == nullptr) continue; - if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, - U.getLongFormMaxDisp(), U.NegOk)) { - LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" - << CPEs[i].CPI << "\n"); + if (isCPEntryInRange(UserMI, UserOffset, CPE.CPEMI, U.getLongFormMaxDisp(), + U.NegOk)) { + LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPE.CPI + << "\n"); // Point the CPUser node to the replacement - U.CPEMI = CPEs[i].CPEMI; + U.CPEMI = CPE.CPEMI; // Change the CPI in the instruction operand to refer to the clone. for (MachineOperand &MO : UserMI->operands()) if (MO.isCPI()) { - MO.setIndex(CPEs[i].CPI); + MO.setIndex(CPE.CPI); break; } // Adjust the refcount of the clone... - CPEs[i].RefCount++; + CPE.RefCount++; // ...and the original. If we didn't remove the old entry, none of the // addresses changed, so we don't need another pass. return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; @@ -1435,15 +1435,14 @@ void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { /// are zero. bool MipsConstantIslands::removeUnusedCPEntries() { unsigned MadeChange = false; - for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { - std::vector<CPEntry> &CPEs = CPEntries[i]; - for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { - if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { - removeDeadCPEMI(CPEs[j].CPEMI); - CPEs[j].CPEMI = nullptr; - MadeChange = true; - } + for (std::vector<CPEntry> &CPEs : CPEntries) { + for (CPEntry &CPE : CPEs) { + if (CPE.RefCount == 0 && CPE.CPEMI) { + removeDeadCPEMI(CPE.CPEMI); + CPE.CPEMI = nullptr; + MadeChange = true; } + } } return MadeChange; } diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 2d27d7553de6..cf6cec22308c 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -309,12 +309,12 @@ INITIALIZE_PASS(MipsDelaySlotFiller, DEBUG_TYPE, static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) { MachineFunction *MF = Filler->getParent()->getParent(); - for (BB2BrMap::const_iterator I = BrMap.begin(); I != BrMap.end(); ++I) { - if (I->second) { - MIBundleBuilder(I->second).append(MF->CloneMachineInstr(&*Filler)); + for (const auto &I : BrMap) { + if (I.second) { + MIBundleBuilder(I.second).append(MF->CloneMachineInstr(&*Filler)); ++UsefulSlots; } else { - I->first->insert(I->first->end(), MF->CloneMachineInstr(&*Filler)); + I.first->push_back(MF->CloneMachineInstr(&*Filler)); } } } diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index 05c1c06ffefe..6ddfec5d0f79 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -313,7 +313,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, llvm_unreachable("unexpected opcode"); } - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; @@ -325,7 +325,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + Register ResultReg = 
createResultReg(&Mips::GPR32RegClass); if (!ResultReg) return 0; @@ -341,7 +341,7 @@ unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + Register ResultReg = createResultReg(&Mips::GPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LEA_ADDiu), ResultReg) .addFrameIndex(SI->second) @@ -362,7 +362,7 @@ unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) { unsigned MipsFastISel::materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); if (isInt<16>(Imm)) { unsigned Opc = Mips::ADDiu; @@ -376,7 +376,7 @@ unsigned MipsFastISel::materialize32BitInt(int64_t Imm, unsigned Hi = (Imm >> 16) & 0xFFFF; if (Lo) { // Both Lo and Hi have nonzero bits. - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); emitInst(Mips::LUi, TmpReg).addImm(Hi); emitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo); } else { @@ -391,13 +391,13 @@ unsigned MipsFastISel::materializeFP(const ConstantFP *CFP, MVT VT) { int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); if (VT == MVT::f32) { const TargetRegisterClass *RC = &Mips::FGR32RegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); unsigned TempReg = materialize32BitInt(Imm, &Mips::GPR32RegClass); emitInst(Mips::MTC1, DestReg).addReg(TempReg); return DestReg; } else if (VT == MVT::f64) { const TargetRegisterClass *RC = &Mips::AFGR64RegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); unsigned TempReg1 = materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass); unsigned TempReg2 = materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass); @@ -412,7 +412,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) { if (VT != MVT::i32) return 0; const TargetRegisterClass *RC = &Mips::GPR32RegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); bool IsThreadLocal = GVar && GVar->isThreadLocal(); // TLS not supported at this time. 
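materialize32BitInt, touched above, implements the classic MIPS idiom for building a 32-bit constant: a value that fits a signed 16-bit immediate is emitted as a single ADDiu off $zero, anything larger goes through LUi for the high half plus ORi for the low half, with the ORi skipped when the low half is zero. The arithmetic, as a self-contained sketch (splitImm32 is a hypothetical helper mirroring the split done before emitting LUi/ORi):

    #include <cstdint>
    struct Halves { uint16_t Hi, Lo; };
    Halves splitImm32(uint32_t Imm) {
      return {uint16_t(Imm >> 16), uint16_t(Imm & 0xFFFF)};
    }
    // lui  $t, Hi      =>  $t = Hi << 16
    // ori  $d, $t, Lo  =>  $d = $t | Lo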
@@ -423,7 +423,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) { .addGlobalAddress(GV, 0, MipsII::MO_GOT); if ((GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))) { - unsigned TempReg = createResultReg(RC); + Register TempReg = createResultReg(RC); emitInst(Mips::ADDiu, TempReg) .addReg(DestReg) .addGlobalAddress(GV, 0, MipsII::MO_ABS_LO); @@ -434,7 +434,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) { unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) { const TargetRegisterClass *RC = &Mips::GPR32RegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); emitInst(Mips::LW, DestReg) .addReg(MFI->getGlobalBaseReg(*MF)) .addSym(Sym, MipsII::MO_GOT); @@ -649,13 +649,13 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) { default: return false; case CmpInst::ICMP_EQ: { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg); emitInst(Mips::SLTiu, ResultReg).addReg(TempReg).addImm(1); break; } case CmpInst::ICMP_NE: { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg); emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg); break; @@ -667,13 +667,13 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) { emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg); break; case CmpInst::ICMP_UGE: { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg); emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1); break; } case CmpInst::ICMP_ULE: { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::SLTu, TempReg).addReg(RightReg).addReg(LeftReg); emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1); break; @@ -685,13 +685,13 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) { emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg); break; case CmpInst::ICMP_SGE: { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg); emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1); break; } case CmpInst::ICMP_SLE: { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::SLT, TempReg).addReg(RightReg).addReg(LeftReg); emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1); break; @@ -737,8 +737,8 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) { default: llvm_unreachable("Only switching of a subset of CCs."); } - unsigned RegWithZero = createResultReg(&Mips::GPR32RegClass); - unsigned RegWithOne = createResultReg(&Mips::GPR32RegClass); + Register RegWithZero = createResultReg(&Mips::GPR32RegClass); + Register RegWithOne = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::ADDiu, RegWithZero).addReg(Mips::ZERO).addImm(0); emitInst(Mips::ADDiu, RegWithOne).addReg(Mips::ZERO).addImm(1); emitInst(Opc).addReg(Mips::FCC0, RegState::Define).addReg(LeftReg) @@ -964,7 +964,7 @@ bool MipsFastISel::selectBranch(const 
Instruction *I) { // For the general case, we need to mask with 1. if (ZExtCondReg == 0) { - unsigned CondReg = getRegForValue(BI->getCondition()); + Register CondReg = getRegForValue(BI->getCondition()); if (CondReg == 0) return false; @@ -982,7 +982,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) { bool MipsFastISel::selectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + Register ResultReg = createResultReg(&Mips::GPR32RegClass); if (!emitCmp(ResultReg, CI)) return false; updateValueMap(I, ResultReg); @@ -1000,13 +1000,13 @@ bool MipsFastISel::selectFPExt(const Instruction *I) { if (SrcVT != MVT::f32 || DestVT != MVT::f64) return false; - unsigned SrcReg = + Register SrcReg = getRegForValue(Src); // this must be a 32bit floating point register class // maybe we should handle this differently if (!SrcReg) return false; - unsigned DestReg = createResultReg(&Mips::AFGR64RegClass); + Register DestReg = createResultReg(&Mips::AFGR64RegClass); emitInst(Mips::CVT_D32_S, DestReg).addReg(SrcReg); updateValueMap(I, DestReg); return true; @@ -1041,22 +1041,22 @@ bool MipsFastISel::selectSelect(const Instruction *I) { const SelectInst *SI = cast<SelectInst>(I); const Value *Cond = SI->getCondition(); - unsigned Src1Reg = getRegForValue(SI->getTrueValue()); - unsigned Src2Reg = getRegForValue(SI->getFalseValue()); - unsigned CondReg = getRegForValue(Cond); + Register Src1Reg = getRegForValue(SI->getTrueValue()); + Register Src2Reg = getRegForValue(SI->getFalseValue()); + Register CondReg = getRegForValue(Cond); if (!Src1Reg || !Src2Reg || !CondReg) return false; - unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass); + Register ZExtCondReg = createResultReg(&Mips::GPR32RegClass); if (!ZExtCondReg) return false; if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true)) return false; - unsigned ResultReg = createResultReg(RC); - unsigned TempReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); + Register TempReg = createResultReg(RC); if (!ResultReg || !TempReg) return false; @@ -1079,11 +1079,11 @@ bool MipsFastISel::selectFPTrunc(const Instruction *I) { if (SrcVT != MVT::f64 || DestVT != MVT::f32) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; - unsigned DestReg = createResultReg(&Mips::FGR32RegClass); + Register DestReg = createResultReg(&Mips::FGR32RegClass); if (!DestReg) return false; @@ -1115,14 +1115,14 @@ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) { if (SrcVT != MVT::f32 && SrcVT != MVT::f64) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (SrcReg == 0) return false; // Determine the opcode for the conversion, which takes place // entirely within FPRs. - unsigned DestReg = createResultReg(&Mips::GPR32RegClass); - unsigned TempReg = createResultReg(&Mips::FGR32RegClass); + Register DestReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::FGR32RegClass); unsigned Opc = (SrcVT == MVT::f32) ? Mips::TRUNC_W_S : Mips::TRUNC_W_D32; // Generate the convert. 
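The fastLowerIntrinsicCall hunks a little further down lower i32 bswap on mips32r2 as WSBH followed by a 16-bit rotate. The identity relied on: WSBH swaps the two bytes inside each halfword, and a rotate by 16 then swaps the halfwords, which together reverse all four bytes. A checkable sketch:

    #include <cstdint>
    uint32_t wsbh(uint32_t X) { // byte swap within each 16-bit half
      return ((X & 0x00FF00FFu) << 8) | ((X >> 8) & 0x00FF00FFu);
    }
    uint32_t rotr(uint32_t X, unsigned N) { return (X >> N) | (X << (32 - N)); }
    uint32_t bswap32(uint32_t X) { return rotr(wsbh(X), 16); } // full reversal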
@@ -1196,7 +1196,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, break; } } - unsigned ArgReg = getRegForValue(ArgVal); + Register ArgReg = getRegForValue(ArgVal); if (!ArgReg) return false; @@ -1294,7 +1294,7 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16) CopyVT = MVT::i32; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); if (!ResultReg) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1462,11 +1462,11 @@ bool MipsFastISel::fastLowerArguments() { for (const auto &FormalArg : F->args()) { unsigned ArgNo = FormalArg.getArgNo(); unsigned SrcReg = Allocation[ArgNo].Reg; - unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[ArgNo].RC); + Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[ArgNo].RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). - unsigned ResultReg = createResultReg(Allocation[ArgNo].RC); + Register ResultReg = createResultReg(Allocation[ArgNo].RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); @@ -1594,10 +1594,10 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { if (!isTypeSupported(RetTy, VT)) return false; - unsigned SrcReg = getRegForValue(II->getOperand(0)); + Register SrcReg = getRegForValue(II->getOperand(0)); if (SrcReg == 0) return false; - unsigned DestReg = createResultReg(&Mips::GPR32RegClass); + Register DestReg = createResultReg(&Mips::GPR32RegClass); if (DestReg == 0) return false; if (VT == MVT::i16) { @@ -1607,9 +1607,9 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { return true; } else { unsigned TempReg[3]; - for (int i = 0; i < 3; i++) { - TempReg[i] = createResultReg(&Mips::GPR32RegClass); - if (TempReg[i] == 0) + for (unsigned &R : TempReg) { + R = createResultReg(&Mips::GPR32RegClass); + if (R == 0) return false; } emitInst(Mips::SLL, TempReg[0]).addReg(SrcReg).addImm(8); @@ -1621,16 +1621,16 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { } } else if (VT == MVT::i32) { if (Subtarget->hasMips32r2()) { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::WSBH, TempReg).addReg(SrcReg); emitInst(Mips::ROTR, DestReg).addReg(TempReg).addImm(16); updateValueMap(II, DestReg); return true; } else { unsigned TempReg[8]; - for (int i = 0; i < 8; i++) { - TempReg[i] = createResultReg(&Mips::GPR32RegClass); - if (TempReg[i] == 0) + for (unsigned &R : TempReg) { + R = createResultReg(&Mips::GPR32RegClass); + if (R == 0) return false; } @@ -1720,7 +1720,7 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (!VA.isRegLoc()) return false; - unsigned Reg = getRegForValue(RV); + Register Reg = getRegForValue(RV); if (Reg == 0) return false; @@ -1788,7 +1788,7 @@ bool MipsFastISel::selectTrunc(const Instruction *I) { if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) return false; - unsigned SrcReg = getRegForValue(Op); + Register SrcReg = getRegForValue(Op); if (!SrcReg) return false; @@ -1804,7 +1804,7 @@ bool MipsFastISel::selectIntExt(const Instruction *I) { Type *SrcTy = Src->getType(); bool isZExt = isa<ZExtInst>(I); - unsigned SrcReg = 
getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; @@ -1818,7 +1818,7 @@ bool MipsFastISel::selectIntExt(const Instruction *I) { MVT SrcVT = SrcEVT.getSimpleVT(); MVT DestVT = DestEVT.getSimpleVT(); - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + Register ResultReg = createResultReg(&Mips::GPR32RegClass); if (!emitIntExt(SrcVT, SrcReg, DestVT, ResultReg, isZExt)) return false; @@ -1839,7 +1839,7 @@ bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT, ShiftAmt = 16; break; } - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::SLL, TempReg).addReg(SrcReg).addImm(ShiftAmt); emitInst(Mips::SRA, DestReg).addReg(TempReg).addImm(ShiftAmt); return true; @@ -1935,15 +1935,15 @@ bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) { break; } - unsigned Src0Reg = getRegForValue(I->getOperand(0)); - unsigned Src1Reg = getRegForValue(I->getOperand(1)); + Register Src0Reg = getRegForValue(I->getOperand(0)); + Register Src1Reg = getRegForValue(I->getOperand(1)); if (!Src0Reg || !Src1Reg) return false; emitInst(DivOpc).addReg(Src0Reg).addReg(Src1Reg); emitInst(Mips::TEQ).addReg(Src1Reg).addReg(Mips::ZERO).addImm(7); - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + Register ResultReg = createResultReg(&Mips::GPR32RegClass); if (!ResultReg) return false; @@ -1962,19 +1962,19 @@ bool MipsFastISel::selectShift(const Instruction *I) { if (!isTypeSupported(I->getType(), RetVT)) return false; - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + Register ResultReg = createResultReg(&Mips::GPR32RegClass); if (!ResultReg) return false; unsigned Opcode = I->getOpcode(); const Value *Op0 = I->getOperand(0); - unsigned Op0Reg = getRegForValue(Op0); + Register Op0Reg = getRegForValue(Op0); if (!Op0Reg) return false; // If AShr or LShr, then we need to make sure the operand0 is sign extended. 
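One detail worth calling out in the selectDivRem hunk above: after emitting the divide, the code emits TEQ with the divisor against $zero and immediate 7. MIPS "teq rs, rt, code" traps when rs == rt, so this raises a trap exactly when the divisor is zero; 7 is the conventional divide-by-zero code (BRK_DIVZERO in the Linux MIPS headers), letting the kernel turn the trap into the expected signal. The hunk itself only migrates the result registers to llvm::Register; the trap idiom is preexisting behavior.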
if (Opcode == Instruction::AShr || Opcode == Instruction::LShr) { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); if (!TempReg) return false; @@ -2008,7 +2008,7 @@ bool MipsFastISel::selectShift(const Instruction *I) { return true; } - unsigned Op1Reg = getRegForValue(I->getOperand(1)); + Register Op1Reg = getRegForValue(I->getOperand(1)); if (!Op1Reg) return false; @@ -2091,7 +2091,7 @@ bool MipsFastISel::fastSelectInstruction(const Instruction *I) { unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V, bool IsUnsigned) { - unsigned VReg = getRegForValue(V); + Register VReg = getRegForValue(V); if (VReg == 0) return 0; MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT(); @@ -2100,7 +2100,7 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V, return 0; if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) { - unsigned TempReg = createResultReg(&Mips::GPR32RegClass); + Register TempReg = createResultReg(&Mips::GPR32RegClass); if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned)) return 0; VReg = TempReg; @@ -2112,7 +2112,7 @@ void MipsFastISel::simplifyAddress(Address &Addr) { if (!isInt<16>(Addr.getOffset())) { unsigned TempReg = materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass); - unsigned DestReg = createResultReg(&Mips::GPR32RegClass); + Register DestReg = createResultReg(&Mips::GPR32RegClass); emitInst(Mips::ADDu, DestReg).addReg(TempReg).addReg(Addr.getReg()); Addr.setReg(DestReg); Addr.setOffset(0); @@ -2129,7 +2129,7 @@ unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode, // followed by another instruction that defines the same registers too. // We can fix this by explicitly marking those registers as dead. if (MachineInstOpcode == Mips::MUL) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 9377e83524e1..0c2e129b8f1f 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2523,7 +2523,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op, MFI.setReturnAddressIsTaken(true); // Return RA, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT)); + Register Reg = MF.addLiveIn(RA, getRegClassFor(VT)); return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT); } @@ -3051,17 +3051,15 @@ getOpndList(SmallVectorImpl<SDValue> &Ops, // stuck together. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); + for (auto &R : RegsToPass) { + Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, R.first, R.second, InFlag); InFlag = Chain.getValue(1); } // Add argument registers to the end of the list so that they are // known live into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + for (auto &R : RegsToPass) + Ops.push_back(CLI.DAG.getRegister(R.first, R.second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp index 6d44ce2ab563..59f158688b16 100644 --- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp @@ -80,8 +80,8 @@ private: MipsInstructionSelector::MipsInstructionSelector( const MipsTargetMachine &TM, const MipsSubtarget &STI, const MipsRegisterBankInfo &RBI) - : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), + : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), + RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "MipsGenGlobalISel.inc" diff --git a/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/llvm/lib/Target/Mips/MipsMachineFunction.cpp index a7a2be30f58a..411a26e42713 100644 --- a/llvm/lib/Target/Mips/MipsMachineFunction.cpp +++ b/llvm/lib/Target/Mips/MipsMachineFunction.cpp @@ -148,14 +148,14 @@ void MipsFunctionInfo::initGlobalBaseReg(MachineFunction &MF) { void MipsFunctionInfo::createEhDataRegsFI(MachineFunction &MF) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - for (int I = 0; I < 4; ++I) { + for (int &I : EhDataRegFI) { const TargetRegisterClass &RC = static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64() ? Mips::GPR64RegClass : Mips::GPR32RegClass; - EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject( - TRI.getSpillSize(RC), TRI.getSpillAlign(RC), false); + I = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlign(RC), false); } } @@ -167,9 +167,9 @@ void MipsFunctionInfo::createISRRegFI(MachineFunction &MF) { const TargetRegisterClass &RC = Mips::GPR32RegClass; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - for (int I = 0; I < 2; ++I) - ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject( - TRI.getSpillSize(RC), TRI.getSpillAlign(RC), false); + for (int &I : ISRDataRegFI) + I = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlign(RC), false); } bool MipsFunctionInfo::isEhDataRegFI(int FI) const { diff --git a/llvm/lib/Target/Mips/MipsMulMulBugPass.cpp b/llvm/lib/Target/Mips/MipsMulMulBugPass.cpp new file mode 100644 index 000000000000..daaf1135c2b1 --- /dev/null +++ b/llvm/lib/Target/Mips/MipsMulMulBugPass.cpp @@ -0,0 +1,136 @@ +//===- MipsMulMulBugPass.cpp - Mips VR4300 mulmul bugfix pass -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Early revisions of the VR4300 have a hardware bug where two consecutive +// multiplications can produce an incorrect result in the second multiply. +// +// This pass scans for mul instructions in each basic block and inserts +// a nop whenever the following conditions are met: +// +// - The current instruction is a single or double-precision floating-point +// mul instruction. +// - The next instruction is either a mul instruction (any kind) +// or a branch instruction. 
+//===----------------------------------------------------------------------===// + +#include "Mips.h" +#include "MipsInstrInfo.h" +#include "MipsSubtarget.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "mips-vr4300-mulmul-fix" + +using namespace llvm; + +namespace { + +class MipsMulMulBugFix : public MachineFunctionPass { +public: + MipsMulMulBugFix() : MachineFunctionPass(ID) { + initializeMipsMulMulBugFixPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Mips VR4300 mulmul bugfix"; } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; + +private: + bool fixMulMulBB(MachineBasicBlock &MBB, const MipsInstrInfo &MipsII); +}; + +} // namespace + +INITIALIZE_PASS(MipsMulMulBugFix, "mips-vr4300-mulmul-fix", + "Mips VR4300 mulmul bugfix", false, false) + +char MipsMulMulBugFix::ID = 0; + +bool MipsMulMulBugFix::runOnMachineFunction(MachineFunction &MF) { + const MipsInstrInfo &MipsII = + *static_cast<const MipsInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + bool Modified = false; + + for (auto &MBB : MF) + Modified |= fixMulMulBB(MBB, MipsII); + + return Modified; +} + +static bool isFirstMul(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case Mips::FMUL_S: + case Mips::FMUL_D: + case Mips::FMUL_D32: + case Mips::FMUL_D64: + return true; + default: + return false; + } +} + +static bool isSecondMulOrBranch(const MachineInstr &MI) { + if (MI.isBranch() || MI.isIndirectBranch() || MI.isCall()) + return true; + + switch (MI.getOpcode()) { + case Mips::MUL: + case Mips::FMUL_S: + case Mips::FMUL_D: + case Mips::FMUL_D32: + case Mips::FMUL_D64: + case Mips::MULT: + case Mips::MULTu: + case Mips::DMULT: + case Mips::DMULTu: + return true; + default: + return false; + } +} + +bool MipsMulMulBugFix::fixMulMulBB(MachineBasicBlock &MBB, + const MipsInstrInfo &MipsII) { + bool Modified = false; + + MachineBasicBlock::instr_iterator NextMII; + + // Iterate through the instructions in the basic block + for (MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), + E = MBB.instr_end(); + MII != E; MII = NextMII) { + + NextMII = next_nodbg(MII, E); + + // Trigger when the current instruction is a mul and the next instruction + // is either a mul or a branch in case the branch target start with a mul + if (NextMII != E && isFirstMul(*MII) && isSecondMulOrBranch(*NextMII)) { + LLVM_DEBUG(dbgs() << "Found mulmul!\n"); + + const MCInstrDesc &NewMCID = MipsII.get(Mips::NOP); + BuildMI(MBB, NextMII, DebugLoc(), NewMCID); + Modified = true; + } + } + + return Modified; +} + +FunctionPass *llvm::createMipsMulMulBugPass() { return new MipsMulMulBugFix(); } diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index 7cba3118cd62..390ab9d22024 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -159,8 +159,8 @@ getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); - for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I) - 
Reserved.set(ReservedGPR32[I]); + for (MCPhysReg R : ReservedGPR32) + Reserved.set(R); // Reserve registers for the NaCl sandbox. if (Subtarget.isTargetNaCl()) { @@ -169,8 +169,8 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::T8); // Reserved for thread pointer. } - for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I) - Reserved.set(ReservedGPR64[I]); + for (MCPhysReg R : ReservedGPR64) + Reserved.set(R); // For mno-abicalls, GP is a program invariant! if (!Subtarget.isABICalls()) { diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 193d071447ff..7ee2ddf3605f 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -454,7 +454,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, // directives. for (const CalleeSavedInfo &I : CSI) { int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); // If Reg is a double precision register, emit two cfa_offsets, // one for each of the paired single precision registers. @@ -801,7 +801,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64) && MF->getFrameInfo().isReturnAddressTaken(); if (!IsRAAndRetAddrIsTaken) diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 40b215a8204c..346ebe9664fc 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -85,18 +85,18 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, if (Subtarget.hasDSP()) { MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8}; - for (unsigned i = 0; i < array_lengthof(VecTys); ++i) { - addRegisterClass(VecTys[i], &Mips::DSPRRegClass); + for (const auto &VecTy : VecTys) { + addRegisterClass(VecTy, &Mips::DSPRRegClass); // Expand all builtin opcodes. for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) - setOperationAction(Opc, VecTys[i], Expand); + setOperationAction(Opc, VecTy, Expand); - setOperationAction(ISD::ADD, VecTys[i], Legal); - setOperationAction(ISD::SUB, VecTys[i], Legal); - setOperationAction(ISD::LOAD, VecTys[i], Legal); - setOperationAction(ISD::STORE, VecTys[i], Legal); - setOperationAction(ISD::BITCAST, VecTys[i], Legal); + setOperationAction(ISD::ADD, VecTy, Legal); + setOperationAction(ISD::SUB, VecTy, Legal); + setOperationAction(ISD::LOAD, VecTy, Legal); + setOperationAction(ISD::STORE, VecTy, Legal); + setOperationAction(ISD::BITCAST, VecTy, Legal); } setTargetDAGCombine(ISD::SHL); @@ -2931,7 +2931,7 @@ static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy, // operand is unused and can be replaced with anything. We choose to replace it // with the used operand since this reduces the number of instructions overall. 
static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy, - SmallVector<int, 16> Indices, + const SmallVector<int, 16> &Indices, SelectionDAG &DAG) { SmallVector<SDValue, 16> Ops; SDValue Op0; @@ -2953,9 +2953,8 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy, Using2ndVec = true; } - for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end(); - ++I) - Ops.push_back(DAG.getTargetConstant(*I, DL, MaskEltTy)); + for (int Idx : Indices) + Ops.push_back(DAG.getTargetConstant(Idx, DL, MaskEltTy)); SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops); diff --git a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp index b05e9ad827c4..d6481793ef49 100644 --- a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -38,7 +38,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-reg-info" -MipsSERegisterInfo::MipsSERegisterInfo() : MipsRegisterInfo() {} +MipsSERegisterInfo::MipsSERegisterInfo() {} bool MipsSERegisterInfo:: requiresRegisterScavenging(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 8de3c9fd25bd..f9f662a00117 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -45,6 +45,10 @@ using namespace llvm; #define DEBUG_TYPE "mips" +static cl::opt<bool> + EnableMulMulFix("mfix4300", cl::init(false), + cl::desc("Enable the VR4300 mulmul bug fix."), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() { // Register the target. RegisterTargetMachine<MipsebTargetMachine> X(getTheMipsTarget()); @@ -58,6 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() { initializeMipsBranchExpansionPass(*PR); initializeMicroMipsSizeReducePass(*PR); initializeMipsPreLegalizerCombinerPass(*PR); + initializeMipsMulMulBugFixPass(*PR); } static std::string computeDataLayout(const Triple &TT, StringRef CPU, @@ -292,6 +297,11 @@ void MipsPassConfig::addPreEmitPass() { // instructions which can be remapped to a 16 bit instruction. addPass(createMicroMipsSizeReducePass()); + // This pass inserts a nop instruction between two back-to-back multiplication + // instructions when the "mfix4300" flag is passed. + if (EnableMulMulFix) + addPass(createMipsMulMulBugPass()); + // The delay slot filler pass can potientially create forbidden slot hazards // for MIPSR6 and therefore it should go before MipsBranchExpansion pass. 
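Two things land in the hunks above. lowerVECTOR_SHUFFLE_VSHF now takes Indices by const reference instead of by value, avoiding a SmallVector copy per shuffle lowered (LLVM would more typically spell this ArrayRef<int>, which accepts either container shape). And the new VR4300 workaround pass is wired into the pre-emit pipeline behind the hidden "mfix4300" cl::opt; since cl::opts surface on tool command lines, the fix can be exercised from llc. A hedged invocation, with file names invented for illustration:

    llc -march=mips -mcpu=mips3 -mfix4300 vr4300.ll -o vr4300.s

With the flag absent the pass is never added, so existing pipelines are unchanged.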
addPass(createMipsDelaySlotFillerPass()); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 82d332ab3f08..da0cbb32659c 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -108,6 +108,10 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, // SAT flag if (Imm & NVPTX::PTXCvtMode::SAT_FLAG) O << ".sat"; + } else if (strcmp(Modifier, "relu") == 0) { + // RELU flag + if (Imm & NVPTX::PTXCvtMode::RELU_FLAG) + O << ".relu"; } else if (strcmp(Modifier, "base") == 0) { // Default operand switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) { @@ -139,6 +143,9 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, case NVPTX::PTXCvtMode::RP: O << ".rp"; break; + case NVPTX::PTXCvtMode::RNA: + O << ".rna"; + break; } } else { llvm_unreachable("Invalid conversion modifier"); diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index c2fd090da084..41e9f375e536 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -137,10 +137,12 @@ enum CvtMode { RZ, RM, RP, + RNA, BASE_MASK = 0x0F, FTZ_FLAG = 0x10, - SAT_FLAG = 0x20 + SAT_FLAG = 0x20, + RELU_FLAG = 0x40 }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 16add48d4602..3a59306c4998 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1214,9 +1214,9 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { std::vector<const GlobalVariable *> &gvars = localDecls[f]; - for (unsigned i = 0, e = gvars.size(); i != e; ++i) { + for (const GlobalVariable *GV : gvars) { O << "\t// demoted variable\n\t"; - printModuleLevelGV(gvars[i], O, true); + printModuleLevelGV(GV, O, true); } } @@ -1454,7 +1454,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() != NVPTX::CUDA) { - Type *ETy = PTy->getElementType(); + Type *ETy = PTy->getPointerElementType(); int addrSpace = PTy->getAddressSpace(); switch (addrSpace) { default: @@ -1514,7 +1514,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // param has byVal attribute. So should be a pointer auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); - Type *ETy = PTy->getElementType(); + Type *ETy = PTy->getPointerElementType(); if (isABI || isKernelFunc) { // Just print .param .align <a> .b8 .param[size]; @@ -1613,7 +1613,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // We use the per class virtual register number in the ptx output. 
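The NVPTX printer changes above extend cvt modifier printing: a RELU_FLAG appends ".relu" (a saturation mode on recent GPUs that clamps negative conversion results to zero) and a new RNA base mode prints ".rna", the round-to-nearest-ties-away rounding the PTX ISA defines for conversions such as float to tf32. The matching enum values are added to NVPTX::PTXCvtMode, with RELU_FLAG given its own bit (0x40) alongside FTZ_FLAG and SAT_FLAG so modifiers can combine, yielding mnemonics like "cvt.rna.tf32.f32" or "cvt.rn.relu.f16.f32" (quoted from the PTX ISA documentation as illustration, not from this diff).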
unsigned int numVRs = MRI->getNumVirtRegs(); for (unsigned i = 0; i < numVRs; i++) { - unsigned int vr = Register::index2VirtReg(i); + Register vr = Register::index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC]; int n = regmap.size(); diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index a9a5eae42c1d..888fc8ffac2c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -96,20 +96,18 @@ bool GenericToNVVM::runOnModule(Module &M) { // Walk through the instructions in function definitions, and replace any use // of original global variables in GVMap with a use of the corresponding // copies in GVMap. If necessary, promote constants to instructions. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (I->isDeclaration()) { + for (Function &F : M) { + if (F.isDeclaration()) { continue; } - IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg()); - for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE; - ++BBI) { - for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE; - ++II) { - for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) { - Value *Operand = II->getOperand(i); + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + for (BasicBlock &BB : F) { + for (Instruction &II : BB) { + for (unsigned i = 0, e = II.getNumOperands(); i < e; ++i) { + Value *Operand = II.getOperand(i); if (isa<Constant>(Operand)) { - II->setOperand( - i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder)); + II.setOperand( + i, remapConstant(&M, &F, cast<Constant>(Operand), Builder)); } } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index e2f6b69fc530..eac237bb27bb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -553,17 +553,30 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // These map to corresponding instructions for f32/f64. f16 must be // promoted to f32. v2f16 is expanded to f16, which is then promoted // to f32. - for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, - ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { + for (const auto &Op : + {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); setOperationAction(Op, MVT::v2f16, Expand); } - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); + // max.f16, max.f16x2 and max.NaN are supported on sm_80+. + auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { + bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; + return IsAtLeastSm80 ?
Legal : NotSm80Action; }; for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); } for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); setOperationAction(Op, MVT::f64, GetMinMaxAction(Expand)); setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); } // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. // No FPOW or FREM in PTX. @@ -1341,7 +1354,7 @@ std::string NVPTXTargetLowering::getPrototype( } auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); - Type *ETy = PTy->getElementType(); + Type *ETy = PTy->getPointerElementType(); Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); unsigned sz = DL.getTypeAllocSize(ETy); @@ -1564,7 +1577,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<uint64_t, 16> Offsets; auto *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0); + ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets, + 0); // declare .param .align <align> .b8 .param<n>[<size>]; unsigned sz = Outs[OIdx].Flags.getByValSize(); @@ -2434,7 +2448,7 @@ static bool isImageOrSamplerVal(const Value *arg, const Module *context) { if (!context) return false; - auto *STy = dyn_cast<StructType>(PTy->getElementType()); + auto *STy = dyn_cast<StructType>(PTy->getPointerElementType()); if (!STy || STy->isLiteral()) return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index fc0d5cc6fbfa..eeedce2d99cb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -57,12 +57,9 @@ bool NVPTXImageOptimizer::runOnFunction(Function &F) { InstrToDelete.clear(); // Look for call instructions in the function - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; - ++BI) { - for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end(); - I != E; ++I) { - Instruction &Instr = *I; - if (CallInst *CI = dyn_cast<CallInst>(I)) { + for (BasicBlock &BB : F) { + for (Instruction &Instr : BB) { + if (CallInst *CI = dyn_cast<CallInst>(&Instr)) { Function *CalledF = CI->getCalledFunction(); if (CalledF && CalledF->isIntrinsic()) { // This is an intrinsic function call, check if it's an istypep @@ -84,8 +81,8 @@ bool NVPTXImageOptimizer::runOnFunction(Function &F) { } // Delete any istypep instances we replaced in the IR - for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i) - InstrToDelete[i]->eraseFromParent(); + for (Instruction *I : InstrToDelete) + I->eraseFromParent(); return Changed; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 96386af569de..22e200e77831 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -48,6 +48,7 @@ def CvtRN : PatLeaf<(i32 0x5)>; def CvtRZ : PatLeaf<(i32 0x6)>; def CvtRM : PatLeaf<(i32 0x7)>; def CvtRP : PatLeaf<(i32 0x8)>; +def CvtRNA : PatLeaf<(i32 0x9)>; def CvtNONE_FTZ : PatLeaf<(i32
0x10)>; def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; @@ -62,6 +63,10 @@ def CvtRP_FTZ : PatLeaf<(i32 0x18)>; def CvtSAT : PatLeaf<(i32 0x20)>; def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; +def CvtNONE_RELU : PatLeaf<(i32 0x40)>; +def CvtRN_RELU : PatLeaf<(i32 0x45)>; +def CvtRZ_RELU : PatLeaf<(i32 0x46)>; + def CvtMode : Operand<i32> { let PrintMethod = "printCvtMode"; } @@ -249,6 +254,32 @@ multiclass F3<string OpcStr, SDNode OpNode> { (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + + def f16rr_ftz : + NVPTXInst<(outs Float16Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), + [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, + Requires<[useFP16Math, doF32FTZ]>; + def f16rr : + NVPTXInst<(outs Float16Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), + [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, + Requires<[useFP16Math]>; + + def f16x2rr_ftz : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, doF32FTZ]>; + def f16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math]>; } // Template for instructions which take three FP args. The @@ -500,6 +531,29 @@ let hasSideEffects = false in { "cvt.s64.s16 \t$dst, $src;", []>; def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), "cvt.s64.s32 \t$dst, $src;", []>; + +multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> { + def _f32 : + NVPTXInst<(outs RC:$dst), + (ins Float32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:relu}.", + FromName, ".f32 \t$dst, $src;"), []>, + Requires<[hasPTX70, hasSM80]>; + } + + defm CVT_bf16 : CVT_FROM_FLOAT_SM80<"bf16", Int16Regs>; + + multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> { + def _f32 : + NVPTXInst<(outs RC:$dst), + (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:relu}.", + FromName, ".f32 \t$dst, $src1, $src2;"), []>, + Requires<[hasPTX70, hasSM80]>; + } + + defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Float16x2Regs>; + defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; } //----------------------------------- @@ -842,6 +896,8 @@ defm FMUL : F3_fma_component<"mul", fmul>; defm FMIN : F3<"min", fminnum>; defm FMAX : F3<"max", fmaxnum>; +defm FMINNAN : F3<"min.NaN", fminimum>; +defm FMAXNAN : F3<"max.NaN", fmaximum>; defm FABS : F2<"abs", fabs>; defm FNEG : F2<"neg", fneg>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 511cd875ac55..ec069a0a02ae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1046,6 +1046,38 @@ def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a), def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a), (CVT_f32_u32 Int32Regs:$a, CvtRP)>; +def : Pat<(int_nvvm_ff2bf16x2_rn Float32Regs:$a, Float32Regs:$b), + (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>; +def : Pat<(int_nvvm_ff2bf16x2_rn_relu Float32Regs:$a, Float32Regs:$b), + (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, 
CvtRN_RELU)>; +def : Pat<(int_nvvm_ff2bf16x2_rz Float32Regs:$a, Float32Regs:$b), + (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>; +def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b), + (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>; + +def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b), + (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>; +def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b), + (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>; +def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b), + (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>; +def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b), + (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>; + +def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a), + (CVT_bf16_f32 Float32Regs:$a, CvtRN)>; +def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a), + (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>; +def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a), + (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>; +def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a), + (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>; + +def CVT_tf32_f32 : + NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a), + "cvt.rna.tf32.f32 \t$dest, $a;", + [(set Int32Regs:$dest, (int_nvvm_f2tf32_rna Float32Regs:$a))]>; + def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};", Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 6cf59d285e8d..f655f25602bc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -66,10 +66,9 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); // Collect all aggregate loads and mem* calls. - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { - for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; - ++II) { - if (LoadInst *LI = dyn_cast<LoadInst>(II)) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->hasOneUse()) continue; @@ -81,7 +80,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { continue; AggrLoads.push_back(LI); } - } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) { + } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(&I)) { // Convert intrinsic calls with variable size or with constant size // larger than the MaxAggrCopySize threshold. 
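In effect, once a memcpy's length is unknown at compile time or exceeds the MaxAggrCopySize threshold, the pass expands the intrinsic into an explicit copy loop at the IR level. A rough scalar model of the result, illustration only and assuming a plain byte-wise expansion:

    // Rough model of the expanded form; the pass itself rewrites LLVM IR,
    // and the real expansion may copy wider units than single bytes.
    void loweredMemcpy(char *Dst, const char *Src, unsigned long N) {
      for (unsigned long I = 0; I != N; ++I)
        Dst[I] = Src[I]; // one element per iteration, no fat intrinsic call
    }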
if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) { diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index ddb7f097fe68..67aa49132016 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -233,7 +233,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { assert(PType && "Expecting pointer type in handleByValParam"); - Type *StructType = PType->getElementType(); + Type *StructType = PType->getPointerElementType(); auto IsALoadChain = [&](Value *Start) { SmallVector<Value *, 16> ValuesToCheck = {Start}; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 05c20369abf4..5a6440c91fca 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -49,8 +49,8 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM) : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0), - SmVersion(20), TM(TM), InstrInfo(), - TLInfo(TM, initializeSubtargetDependencies(CPU, FS)), FrameLowering() {} + SmVersion(20), TM(TM), + TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {} bool NVPTXSubtarget::hasImageHandles() const { // Enable handles for Kepler+, where CUDA supports indirect surfaces and diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 366d92a5a805..4645671a0cd8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -17,7 +17,7 @@ namespace llvm { class NVPTXTargetObjectFile : public TargetLoweringObjectFile { public: - NVPTXTargetObjectFile() : TargetLoweringObjectFile() {} + NVPTXTargetObjectFile() {} ~NVPTXTargetObjectFile() override; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 74d129d330f3..2d6d72777db2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -286,8 +286,7 @@ bool getAlign(const Function &F, unsigned index, unsigned &align) { bool retval = findAllNVVMAnnotation(&F, "align", Vs); if (!retval) return false; - for (int i = 0, e = Vs.size(); i < e; i++) { - unsigned v = Vs[i]; + for (unsigned v : Vs) { if ((v >> 16) == index) { align = v & 0xFFFF; return true; diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index ded922329ebf..715cff72dcab 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -121,6 +121,7 @@ class PPCAsmParser : public MCTargetAsmParser { bool ParseDirectiveMachine(SMLoc L); bool ParseDirectiveAbiVersion(SMLoc L); bool ParseDirectiveLocalEntry(SMLoc L); + bool ParseGNUAttribute(SMLoc L); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -201,7 +202,8 @@ struct PPCOperand : public MCParsedAsmOperand { struct TLSRegOp TLSReg; }; - PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + PPCOperand(KindTy K) : Kind(K) {} + public: PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() { Kind = o.Kind; @@ -1604,6 +1606,8 @@ bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) { ParseDirectiveAbiVersion(DirectiveID.getLoc()); else if (IDVal == ".localentry") ParseDirectiveLocalEntry(DirectiveID.getLoc()); + else if (IDVal.startswith(".gnu_attribute")) + 
ParseGNUAttribute(DirectiveID.getLoc()); else return true; return false; @@ -1719,7 +1723,16 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { return false; } +bool PPCAsmParser::ParseGNUAttribute(SMLoc L) { + int64_t Tag; + int64_t IntegerValue; + if (!getParser().parseGNUAttribute(L, Tag, IntegerValue)) + return false; + + getParser().getStreamer().emitGNUAttribute(Tag, IntegerValue); + return true; +} /// Force static initialization. extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() { diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp index 7d64816ed6c7..0cd8350e3fdd 100644 --- a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp +++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp @@ -65,8 +65,7 @@ private: PPCInstructionSelector::PPCInstructionSelector(const PPCTargetMachine &TM, const PPCSubtarget &STI, const PPCRegisterBankInfo &RBI) - : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), + : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "PPCGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index 0ca8587ba483..b92b0fc342ec 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -40,9 +40,8 @@ PPCELFStreamer::PPCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter) - : MCELFStreamer(Context, std::move(MAB), std::move(OW), - std::move(Emitter)), LastLabel(NULL) { -} + : MCELFStreamer(Context, std::move(MAB), std::move(OW), std::move(Emitter)), + LastLabel(nullptr) {} void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index d6e02d0d0862..a651362f703b 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -271,14 +271,14 @@ private: MCAssembler &MCA = getStreamer().getAssembler(); int64_t Offset; if (!LocalOffset->evaluateAsAbsolute(Offset, MCA)) - MCA.getContext().reportFatalError( - LocalOffset->getLoc(), ".localentry expression must be absolute."); + MCA.getContext().reportError(LocalOffset->getLoc(), + ".localentry expression must be absolute"); switch (Offset) { default: - MCA.getContext().reportFatalError( - LocalOffset->getLoc(), - ".localentry expression is not a valid power of 2."); + MCA.getContext().reportError( + LocalOffset->getLoc(), ".localentry expression must be a power of 2"); + return 0; case 0: return 0; case 1: diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index f3ae0010ad8e..edd3b42d47e1 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -409,8 +409,8 @@ def : InstRW<[P10W_DF_13C, P10W_DISP_ANY, P10DF_Read, P10DF_Read, P10DF_Read], // 13 Cycles Decimal Floating Point operations, and 3 Cycles Store operations, 2 input operands def : InstRW<[P10W_DF_13C, P10W_DISP_EVEN, P10W_ST_3C, P10W_DISP_ANY], (instrs - HASHST, - HASHSTP + HASHST, HASHST8, + HASHSTP, HASHSTP8 )>; // 24 
Cycles Decimal Floating Point operations, 1 input operands @@ -619,6 +619,8 @@ def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read], XSCMPEXPQP, XSCMPOQP, XSCMPUQP, + XSMAXCQP, + XSMINCQP, XSTSTDCQP, XXGENPCVBM )>; @@ -1336,8 +1338,8 @@ def : InstRW<[P10W_LD_6C, P10W_DISP_ANY, P10LD_Read, P10LD_Read], // 6 Cycles Load operations, and 13 Cycles Decimal Floating Point operations, 2 input operands def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DF_13C, P10W_DISP_ANY], (instrs - HASHCHK, - HASHCHKP + HASHCHK, HASHCHK8, + HASHCHKP, HASHCHKP8 )>; // Single crack instructions diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index f7c049951c54..c088d7847ce4 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -1415,7 +1415,7 @@ def : InstRW<[], (instregex "NOP_GT_PWR(6|7)$"), (instregex "TLB(IA|IVAX|SX|SX2|SX2D|LD|LI|RE|RE2|WE|WE2)$"), (instregex "WRTEE(I)?$"), - (instregex "HASH(ST|STP|CHK|CHKP)$"), + (instregex "HASH(ST|STP|CHK|CHKP)(8)?$"), ATTN, CLRBHRB, MFBHRBE, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index f26c15667a0b..780981806996 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -109,6 +109,23 @@ struct DenseMapInfo<std::pair<const MCSymbol *, MCSymbolRefExpr::VariantKind>> { namespace { +enum { + // GNU attribute tags for PowerPC ABI + Tag_GNU_Power_ABI_FP = 4, + Tag_GNU_Power_ABI_Vector = 8, + Tag_GNU_Power_ABI_Struct_Return = 12, + + // GNU attribute values for PowerPC float ABI, as combination of two parts + Val_GNU_Power_ABI_NoFloat = 0b00, + Val_GNU_Power_ABI_HardFloat_DP = 0b01, + Val_GNU_Power_ABI_SoftFloat_DP = 0b10, + Val_GNU_Power_ABI_HardFloat_SP = 0b11, + + Val_GNU_Power_ABI_LDBL_IBM128 = 0b0100, + Val_GNU_Power_ABI_LDBL_64 = 0b1000, + Val_GNU_Power_ABI_LDBL_IEEE128 = 0b1100, +}; + class PPCAsmPrinter : public AsmPrinter { protected: // For TLS on AIX, we need to be able to identify TOC entries of specific @@ -178,6 +195,8 @@ public: return "Linux PPC Assembly Printer"; } + void emitGNUAttributes(Module &M); + void emitStartOfAsmFile(Module &M) override; void emitEndOfAsmFile(Module &) override; @@ -1388,6 +1407,28 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +void PPCLinuxAsmPrinter::emitGNUAttributes(Module &M) { + // Emit float ABI into GNU attribute + Metadata *MD = M.getModuleFlag("float-abi"); + MDString *FloatABI = dyn_cast_or_null<MDString>(MD); + if (!FloatABI) + return; + StringRef flt = FloatABI->getString(); + // TODO: Support emitting soft-fp and hard double/single attributes. 
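The float-ABI value emitted is the bitwise OR of a two-bit float-class part and a long-double part, so the three cases in the if/else chain that follows reduce to small constants. Checking the arithmetic, with the values taken from the enum above:

    // OR of the two parts defined in the enum above (checked by hand):
    constexpr unsigned DoubleDouble =
        0b01 | 0b0100; // HardFloat_DP | LDBL_IBM128  -> 5
    constexpr unsigned IEEEQuad =
        0b01 | 0b1100; // HardFloat_DP | LDBL_IEEE128 -> 13
    constexpr unsigned IEEEDouble =
        0b01 | 0b1000; // HardFloat_DP | LDBL_64      -> 9
    // So a module flag "float-abi"="doubledouble" should surface in the
    // assembly as: .gnu_attribute 4, 5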
+ if (flt == "doubledouble") + OutStreamer->emitGNUAttribute(Tag_GNU_Power_ABI_FP, + Val_GNU_Power_ABI_HardFloat_DP | + Val_GNU_Power_ABI_LDBL_IBM128); + else if (flt == "ieeequad") + OutStreamer->emitGNUAttribute(Tag_GNU_Power_ABI_FP, + Val_GNU_Power_ABI_HardFloat_DP | + Val_GNU_Power_ABI_LDBL_IEEE128); + else if (flt == "ieeedouble") + OutStreamer->emitGNUAttribute(Tag_GNU_Power_ABI_FP, + Val_GNU_Power_ABI_HardFloat_DP | + Val_GNU_Power_ABI_LDBL_64); +} + void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { if (!Subtarget->isPPC64()) return PPCAsmPrinter::emitInstruction(MI); @@ -1642,6 +1683,8 @@ void PPCLinuxAsmPrinter::emitEndOfAsmFile(Module &M) { PPCTargetStreamer *TS = static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); + emitGNUAttributes(M); + if (!TOC.empty()) { const char *Name = isPPC64 ? ".toc" : ".got2"; MCSectionELF *Section = OutContext.getELFSection( diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 856569bc8a73..e7cd107c5046 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -150,7 +150,7 @@ class PPCFastISel final : public FastISel { unsigned copyRegToRegClass(const TargetRegisterClass *ToRC, unsigned SrcReg, unsigned Flag = 0, unsigned SubReg = 0) { - unsigned TmpReg = createResultReg(ToRC); + Register TmpReg = createResultReg(ToRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), TmpReg).addReg(SrcReg, Flag, SubReg); return TmpReg; @@ -428,7 +428,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset, // put the alloca address into a register, set the base type back to // register and continue. This should almost never happen. if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) { - unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + Register ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8), ResultReg).addFrameIndex(Addr.Base.FI).addImm(0); Addr.Base.Reg = ResultReg; @@ -604,7 +604,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) { // Look at the currently assigned register for this instruction // to determine the required register class. This is necessary // to constrain RA from using R0/X0 when this is not legal. - unsigned AssignedReg = FuncInfo.ValueMap[I]; + Register AssignedReg = FuncInfo.ValueMap[I]; const TargetRegisterClass *RC = AssignedReg ? 
MRI.getRegClass(AssignedReg) : nullptr; @@ -783,7 +783,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { PPCPred = PPC::InvertPredicate(PPCPred); } - unsigned CondReg = createResultReg(&PPC::CRRCRegClass); + Register CondReg = createResultReg(&PPC::CRRCRegClass); if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), CondReg, PPCPred)) @@ -847,7 +847,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, } } - unsigned SrcReg1 = getRegForValue(SrcValue1); + Register SrcReg1 = getRegForValue(SrcValue1); if (SrcReg1 == 0) return false; @@ -928,13 +928,13 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, } if (NeedsExt) { - unsigned ExtReg = createResultReg(&PPC::GPRCRegClass); + Register ExtReg = createResultReg(&PPC::GPRCRegClass); if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt)) return false; SrcReg1 = ExtReg; if (!UseImm) { - unsigned ExtReg = createResultReg(&PPC::GPRCRegClass); + Register ExtReg = createResultReg(&PPC::GPRCRegClass); if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt)) return false; SrcReg2 = ExtReg; @@ -960,7 +960,7 @@ bool PPCFastISel::SelectFPExt(const Instruction *I) { if (SrcVT != MVT::f32 || DestVT != MVT::f64) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; @@ -978,7 +978,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) { if (SrcVT != MVT::f64 || DestVT != MVT::f32) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; @@ -1019,7 +1019,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, // If necessary, extend 32-bit int to 64-bit. if (SrcVT == MVT::i32) { - unsigned TmpReg = createResultReg(&PPC::G8RCRegClass); + Register TmpReg = createResultReg(&PPC::G8RCRegClass); if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned)) return 0; SrcReg = TmpReg; @@ -1079,7 +1079,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { SrcVT != MVT::i32 && SrcVT != MVT::i64) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (SrcReg == 0) return false; @@ -1091,7 +1091,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { else Opc = IsSigned ? PPC::EFDCFSI : PPC::EFDCFUI; - unsigned DestReg = createResultReg(&PPC::SPERCRegClass); + Register DestReg = createResultReg(&PPC::SPERCRegClass); // Generate the convert. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) .addReg(SrcReg); @@ -1114,7 +1114,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { // Extend the input if necessary. if (SrcVT == MVT::i8 || SrcVT == MVT::i16) { - unsigned TmpReg = createResultReg(&PPC::G8RCRegClass); + Register TmpReg = createResultReg(&PPC::G8RCRegClass); if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned)) return false; SrcVT = MVT::i64; @@ -1128,7 +1128,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { // Determine the opcode for the conversion. const TargetRegisterClass *RC = &PPC::F8RCRegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); unsigned Opc; if (DstVT == MVT::f32) @@ -1170,7 +1170,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT, // Look at the currently assigned register for this instruction // to determine the required register class. 
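A note on the pervasive unsigned-to-Register rewrites in this file: llvm::Register is a thin value wrapper around the same 32-bit id, so these hunks are behavior-preserving; they only let the type system distinguish register numbers from arbitrary integers. A simplified model of the type (my sketch, not the LLVM header; the tag bit is illustrative):

    // Simplified model of llvm::Register: same 32-bit id, but typed.
    class Register {
      unsigned Id;

    public:
      constexpr Register(unsigned Id = 0) : Id(Id) {}
      constexpr operator unsigned() const { return Id; } // keeps old code valid
      bool isVirtual() const { return Id & 0x80000000u; } // illustrative tag bit
    };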
- unsigned AssignedReg = FuncInfo.ValueMap[I]; + Register AssignedReg = FuncInfo.ValueMap[I]; const TargetRegisterClass *RC = AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr; @@ -1206,7 +1206,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (SrcVT != MVT::f32 && SrcVT != MVT::f64) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (SrcReg == 0) return false; @@ -1276,7 +1276,7 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { // Look at the currently assigned register for this instruction // to determine the required register class. If there is no register, // make a conservative choice (don't assign R0). - unsigned AssignedReg = FuncInfo.ValueMap[I]; + Register AssignedReg = FuncInfo.ValueMap[I]; const TargetRegisterClass *RC = (AssignedReg ? MRI.getRegClass(AssignedReg) : &PPC::GPRC_and_GPRC_NOR0RegClass); @@ -1296,8 +1296,8 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { break; } - unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass); - unsigned SrcReg1 = getRegForValue(I->getOperand(0)); + Register ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass); + Register SrcReg1 = getRegForValue(I->getOperand(0)); if (SrcReg1 == 0) return false; // Handle case of small immediate operand. @@ -1355,7 +1355,7 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { } // Reg-reg case. - unsigned SrcReg2 = getRegForValue(I->getOperand(1)); + Register SrcReg2 = getRegForValue(I->getOperand(1)); if (SrcReg2 == 0) return false; // Reverse operands for subtract-from. @@ -1441,7 +1441,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, MVT DestVT = VA.getLocVT(); const TargetRegisterClass *RC = (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false)) llvm_unreachable("Failed to emit a sext!"); ArgVT = DestVT; @@ -1453,7 +1453,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, MVT DestVT = VA.getLocVT(); const TargetRegisterClass *RC = (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true)) llvm_unreachable("Failed to emit a zext!"); ArgVT = DestVT; @@ -1628,7 +1628,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { if (ArgVT.isVector() || ArgVT == MVT::f128) return false; - unsigned Arg = getRegForValue(ArgValue); + Register Arg = getRegForValue(ArgValue); if (Arg == 0) return false; @@ -1734,7 +1734,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { RetRegs.push_back(RetReg); } else { - unsigned Reg = getRegForValue(RV); + Register Reg = getRegForValue(RV); if (Reg == 0) return false; @@ -1767,7 +1767,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { case CCValAssign::ZExt: { const TargetRegisterClass *RC = (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true)) return false; SrcReg = TmpReg; @@ -1776,7 +1776,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { case CCValAssign::SExt: { const TargetRegisterClass *RC = (DestVT == MVT::i64) ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false)) return false; SrcReg = TmpReg; @@ -1857,7 +1857,7 @@ bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, // Attempt to fast-select an indirect branch instruction. bool PPCFastISel::SelectIndirectBr(const Instruction *I) { - unsigned AddrReg = getRegForValue(I->getOperand(0)); + Register AddrReg = getRegForValue(I->getOperand(0)); if (AddrReg == 0) return false; @@ -1884,7 +1884,7 @@ bool PPCFastISel::SelectTrunc(const Instruction *I) { if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) return false; - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; @@ -1903,7 +1903,7 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) { Type *SrcTy = Src->getType(); bool IsZExt = isa<ZExtInst>(I); - unsigned SrcReg = getRegForValue(Src); + Register SrcReg = getRegForValue(Src); if (!SrcReg) return false; EVT SrcEVT, DestEVT; @@ -1921,12 +1921,12 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) { // instruction, use it. Otherwise pick the register class of the // correct size that does not contain X0/R0, since we don't know // whether downstream uses permit that assignment. - unsigned AssignedReg = FuncInfo.ValueMap[I]; + Register AssignedReg = FuncInfo.ValueMap[I]; const TargetRegisterClass *RC = (AssignedReg ? MRI.getRegClass(AssignedReg) : (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass : &PPC::GPRC_and_GPRC_NOR0RegClass)); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt)) return false; @@ -1966,15 +1966,6 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) { return SelectBinaryIntOp(I, ISD::OR); case Instruction::Sub: return SelectBinaryIntOp(I, ISD::SUB); - case Instruction::Call: - // On AIX, call lowering uses the DAG-ISEL path currently so that the - // callee of the direct function call instruction will be mapped to the - // symbol for the function's entry point, which is distinct from the - // function descriptor symbol. The latter is the symbol whose XCOFF symbol - // name is the C-linkage name of the source level function. - if (TM.getTargetTriple().isOSAIX()) - break; - return selectCall(I); case Instruction::Ret: return SelectRet(I); case Instruction::Trunc: @@ -2012,7 +2003,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { else RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass); - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); CodeModel::Model CModel = TM.getCodeModel(); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( @@ -2026,7 +2017,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { else Opc = ((VT == MVT::f32) ? PPC::LFS : PPC::LFD); - unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + Register TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); PPCFuncInfo->setUsesTOCBasePtr(); // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)). @@ -2043,7 +2034,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { // But for large code model, we must generate a LDtocL followed // by the LF[SD]. 
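The constant-pool sequences described in the comments above differ only in how the TOC entry is reached; a compact summary using the pseudo-instruction names from this code (the layout is my paraphrase, not additional diff content):

    // Small code model: the FP load is addressed straight off the TOC entry.
    //     LF[SD] fD, 0(LDtocCPT(Idx, X2))
    // Large code model: the entry's address is itself loaded first via
    // LDtocL, and only then does the FP load fire.
    //     tmp2 = LDtocL(Idx, tmp)
    //     LF[SD] fD, 0(tmp2)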
if (CModel == CodeModel::Large) { - unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + Register TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) @@ -2068,7 +2059,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { assert(VT == MVT::i64 && "Non-address!"); const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass; - unsigned DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); // Global values may be plain old object addresses, TLS object // addresses, constant pool entries, or jump tables. How we generate @@ -2083,6 +2074,12 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { if (GV->isThreadLocal()) return 0; + // If the global has the toc-data attribute then fallback to DAG-ISEL. + if (TM.getTargetTriple().isOSAIX()) + if (const GlobalVariable *Var = dyn_cast_or_null<GlobalVariable>(GV)) + if (Var->hasAttribute("toc-data")) + return false; + PPCFuncInfo->setUsesTOCBasePtr(); // For small code model, generate a simple TOC load. if (CModel == CodeModel::Small) @@ -2099,7 +2096,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // Otherwise we generate: // ADDItocL(ADDIStocHA8(%x2, GV), GV) // Either way, start with the ADDIStocHA8: - unsigned HighPartReg = createResultReg(RC); + Register HighPartReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); @@ -2123,7 +2120,7 @@ unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm, unsigned Lo = Imm & 0xFFFF; unsigned Hi = (Imm >> 16) & 0xFFFF; - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass); if (isInt<16>(Imm)) @@ -2132,7 +2129,7 @@ unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm, .addImm(Imm); else if (Lo) { // Both Lo and Hi have nonzero bits. - unsigned TmpReg = createResultReg(RC); + Register TmpReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg) .addImm(Hi); @@ -2195,7 +2192,7 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, TmpReg3 = TmpReg2; if ((Lo = Remainder & 0xFFFF)) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8), ResultReg).addReg(TmpReg3).addImm(Lo); return ResultReg; @@ -2211,7 +2208,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, // If we're using CR bit registers for i1 values, handle that as a special // case first. if (VT == MVT::i1 && Subtarget->useCRBits()) { - unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); + Register ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg); return ImmReg; @@ -2231,7 +2228,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, // a range of 0..0x7fff. if (isInt<16>(Imm)) { unsigned Opc = (VT == MVT::i64) ? 
PPC::LI8 : PPC::LI; - unsigned ImmReg = createResultReg(RC); + Register ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) .addImm(Imm); return ImmReg; @@ -2283,7 +2280,7 @@ unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + Register ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8), ResultReg).addFrameIndex(SI->second).addImm(0); return ResultReg; @@ -2393,7 +2390,7 @@ unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) { // If we're using CR bit registers for i1 values, handle that as a special // case first. if (VT == MVT::i1 && Subtarget->useCRBits()) { - unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); + Register ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg); return ImmReg; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 3ca563fee970..65c969c196e1 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -674,7 +674,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, : PPC::MFCR); const MCInstrDesc &StoreWordInst = TII.get(isPPC64 ? PPC::STW8 : PPC::STW); const MCInstrDesc &HashST = - TII.get(HasPrivileged ? PPC::HASHSTP : PPC::HASHST); + TII.get(isPPC64 ? (HasPrivileged ? PPC::HASHSTP8 : PPC::HASHST8) + : (HasPrivileged ? PPC::HASHSTP : PPC::HASHST)); // Regarding this assert: Even though LR is saved in the caller's frame (i.e., // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no @@ -1172,7 +1173,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // CFA. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue; // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just @@ -1195,7 +1196,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for // the whole CR word. In the ELFv2 ABI, every CR that was // actually saved gets its own CFI record. - unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2; + Register CRReg = isELFv2ABI? Reg : PPC::CR2; unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(CRReg, true), CRSaveOffset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -1590,7 +1591,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, const MCInstrDesc& MoveToCRInst = TII.get( isPPC64 ? PPC::MTOCRF8 : PPC::MTOCRF); const MCInstrDesc &HashChk = - TII.get(HasPrivileged ? PPC::HASHCHKP : PPC::HASHCHK); + TII.get(isPPC64 ? (HasPrivileged ? PPC::HASHCHKP8 : PPC::HASHCHK8) + : (HasPrivileged ? 
PPC::HASHCHKP : PPC::HASHCHK)); int64_t LROffset = getReturnSaveOffset(); int64_t FPOffset = 0; @@ -2085,7 +2087,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, SmallVector<CalleeSavedInfo, 18> VRegs; for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() || (Reg != PPC::X2 && Reg != PPC::R2)) && "Not expecting to try to spill R2 in a function that must save TOC"); @@ -2337,7 +2339,7 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots( if (BVAllocatable.none()) return false; - unsigned Reg = CS.getReg(); + Register Reg = CS.getReg(); if (!PPC::G8RCRegClass.contains(Reg)) { AllSpilledToReg = false; @@ -2395,7 +2397,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( }); for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); // CR2 through CR4 are the nonvolatile CR fields. bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; @@ -2581,7 +2583,7 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( --BeforeI; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + Register Reg = CSI[i].getReg(); if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) continue; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index ba74af5ef5f7..fdcf6e7e80f2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1365,8 +1365,7 @@ class BitPermutationSelector { ValueBit(SDValue V, unsigned I, Kind K = Variable) : V(V), Idx(I), K(K) {} - ValueBit(Kind K = Variable) - : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {} + ValueBit(Kind K = Variable) : Idx(UINT32_MAX), K(K) {} bool isZero() const { return K == ConstZero || K == VariableKnownToBeZero; @@ -4438,7 +4437,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { // Force the ccreg into CR7. SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); - SDValue InFlag(nullptr, 0); // Null incoming flag value. + SDValue InFlag; // Null incoming flag value. CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, InFlag).getValue(1); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8d6edf07bc53..25cc34badda0 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2433,7 +2433,7 @@ unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, /// the constant being splatted. The ByteSize field indicates the number of /// bytes of each element [124] -> [bhw]. SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { - SDValue OpVal(nullptr, 0); + SDValue OpVal; // If ByteSize of the splat is bigger than the element size of the // build_vector, then we have a case where we are checking for a splat where @@ -3508,8 +3508,9 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { int ShuffV[] = {1, 0, 3, 2}; SDValue Shuff = DAG.getVectorShuffle(MVT::v4i32, dl, SetCC32, SetCC32, ShuffV); - return DAG.getBitcast( - MVT::v2i64, DAG.getNode(ISD::AND, dl, MVT::v4i32, Shuff, SetCC32)); + return DAG.getBitcast(MVT::v2i64, + DAG.getNode(CC == ISD::SETEQ ? ISD::AND : ISD::OR, + dl, MVT::v4i32, Shuff, SetCC32)); } // We handle most of these in the usual way. @@ -4078,8 +4079,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // virtual ones. 
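One functional change in this stretch is easy to miss: the LowerSETCC hunk above now picks the combining opcode from the predicate instead of always using AND. The reasoning, reduced to scalars (my illustration):

    // A 64-bit lane split into two 32-bit halves:
    //   equal     <=> both halves equal    -> combine with AND
    //   not equal <=> either half differs  -> combine with OR
    bool eq64(bool LoEq, bool HiEq) { return LoEq && HiEq; }
    bool ne64(bool LoNe, bool HiNe) { return LoNe || HiNe; }
    // The {1, 0, 3, 2} shuffle lines each half up with its partner, and the
    // SETEQ ? AND : OR choice applies the rule above lane-wise.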
if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) { assert(i + 1 < e && "No second half of double precision argument"); - unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC); - unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC); + Register RegLo = MF.addLiveIn(VA.getLocReg(), RC); + Register RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC); SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32); SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32); if (!Subtarget.isLittleEndian()) @@ -4087,7 +4088,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo, ArgValueHi); } else { - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT == MVT::i1 ? MVT::i32 : ValVT); if (ValVT == MVT::i1) @@ -4179,7 +4180,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // dereferencing the result of va_next. for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { // Get an existing live-in vreg, or add a new one. - unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); + Register VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); if (!VReg) VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); @@ -4198,7 +4199,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // on the stack. for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { // Get an existing live-in vreg, or add a new one. - unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); + Register VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); if (!VReg) VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); @@ -4384,7 +4385,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( InVals.push_back(Arg); if (GPR_idx != Num_GPR_Regs) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), ObjSize * 8); @@ -4408,7 +4409,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (GPR_idx == Num_GPR_Regs) break; - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; @@ -4432,7 +4433,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i64: if (Flags.isNest()) { // The 'nest' parameter, if any, is passed in R11. - unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); + Register VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) @@ -4445,7 +4446,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. 
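The comment above about Clang avoiding "byval" for small aggregates deserves a concrete picture; a hypothetical example (nothing here comes from the diff):

    // Hypothetical example: an 8-byte aggregate fits a single 64-bit GPR, so
    // Clang can pass it directly instead of spilling it as a "byval" copy.
    struct Pair {
      int A, B; // 8 bytes total
    };
    long consume(Pair P); // P's bits can travel in one GPR on PPC64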
if (GPR_idx != Num_GPR_Regs) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); @@ -4491,7 +4492,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // This can only ever happen in the presence of f32 array types, // since otherwise we never run out of FPRs before running out // of GPRs. - unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); @@ -4532,7 +4533,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. if (VR_idx != Num_VR_Regs) { - unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); + Register VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); ++VR_idx; } else { @@ -4591,7 +4592,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // the result of va_next. for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; GPR_idx < Num_GPR_Regs; ++GPR_idx) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); @@ -7059,7 +7060,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg, unsigned Offset) { - const unsigned VReg = MF.addLiveIn(PhysReg, RegClass); + const Register VReg = MF.addLiveIn(PhysReg, RegClass); // Since the callers side has left justified the aggregate in the // register, we can simply store the entire register into the stack // slot. @@ -7156,7 +7157,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( (CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize; GPRIndex < NumGPArgRegs; ++GPRIndex) { - const unsigned VReg = + const Register VReg = IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass) : MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass); @@ -11178,13 +11179,17 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: + case ISD::FP_TO_UINT: { // LowerFP_TO_INT() can only handle f32 and f64. if (N->getOperand(N->isStrictFPOpcode() ? 
1 : 0).getValueType() == MVT::ppcf128) return; - Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); + SDValue LoweredValue = LowerFP_TO_INT(SDValue(N, 0), DAG, dl); + Results.push_back(LoweredValue); + if (N->isStrictFPOpcode()) + Results.push_back(LoweredValue.getValue(1)); return; + } case ISD::TRUNCATE: { if (!N->getValueType(0).isVector()) return; @@ -17890,7 +17895,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && "Only support quadword now"); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType(); + Type *ValTy = AlignedAddr->getType()->getPointerElementType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *RMW = Intrinsic::getDeclaration( M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); @@ -17915,7 +17920,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && "Only support quadword now"); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType(); + Type *ValTy = AlignedAddr->getType()->getPointerElementType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *IntCmpXchg = Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 87b7f96112ec..eb52e4aa6273 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1456,4 +1456,4 @@ namespace llvm { } // end namespace llvm -#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H +#endif // LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 58af8037f59c..eae8e36e475e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1760,26 +1760,27 @@ defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB), // These instructions store a hash computed from the value of the link register // and the value of the stack pointer. -let mayStore = 1 in { -def HASHST : XForm_XD6_RA5_RB5<31, 722, (outs), - (ins g8rc:$RB, memrihash:$D_RA_XD), - "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>; -def HASHSTP : XForm_XD6_RA5_RB5<31, 658, (outs), +let mayStore = 1, Interpretation64Bit = 1, isCodeGenOnly = 1 in { +def HASHST8 : XForm_XD6_RA5_RB5<31, 722, (outs), (ins g8rc:$RB, memrihash:$D_RA_XD), - "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>; + "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>; +def HASHSTP8 : XForm_XD6_RA5_RB5<31, 658, (outs), + (ins g8rc:$RB, memrihash:$D_RA_XD), + "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>; } // These instructions check a hash computed from the value of the link register // and the value of the stack pointer. The hasSideEffects flag is needed as the // instruction may TRAP if the hash does not match the hash stored at the // specified address. 
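The HASH* story now spans two files: the 64-bit twins defined around this point carry Interpretation64Bit/isCodeGenOnly, while gprc-based base forms are added to PPCInstrInfo.td later in this diff. A summary of the pairing as the change leaves it:

    // How the ROP-protection opcodes pair up after this change:
    //   mnemonic   32-bit def (gprc)   64-bit codegen-only twin (g8rc)
    //   hashst     HASHST              HASHST8
    //   hashstp    HASHSTP             HASHSTP8
    //   hashchk    HASHCHK             HASHCHK8
    //   hashchkp   HASHCHKP            HASHCHKP8
    // The printed assembly is identical; the 8-forms only change the register
    // class codegen reasons about, which is what lets PPCFrameLowering select
    // by isPPC64 and HasPrivileged in the hunks above.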
-let mayLoad = 1, hasSideEffects = 1 in { -def HASHCHK : XForm_XD6_RA5_RB5<31, 754, (outs), - (ins g8rc:$RB, memrihash:$D_RA_XD), - "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>; -def HASHCHKP : XForm_XD6_RA5_RB5<31, 690, (outs), +let mayLoad = 1, hasSideEffects = 1, + Interpretation64Bit = 1, isCodeGenOnly = 1 in { +def HASHCHK8 : XForm_XD6_RA5_RB5<31, 754, (outs), (ins g8rc:$RB, memrihash:$D_RA_XD), - "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>; + "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>; +def HASHCHKP8 : XForm_XD6_RA5_RB5<31, 690, (outs), + (ins g8rc:$RB, memrihash:$D_RA_XD), + "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>; } let Interpretation64Bit = 1, isCodeGenOnly = 1, hasSideEffects = 1 in diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index a0fd2111de11..eada872c2a7d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2339,9 +2339,8 @@ bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI, Found = true; } } else if (MO.isRegMask()) { - for (TargetRegisterClass::iterator I = RC->begin(), - IE = RC->end(); I != IE; ++I) - if (MO.clobbersPhysReg(*I)) { + for (MCPhysReg R : *RC) + if (MO.clobbersPhysReg(R)) { Pred.push_back(MO); Found = true; } @@ -3253,7 +3252,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( Register Reg = MI.getOperand(i).getReg(); if (!Register::isVirtualRegister(Reg)) continue; - unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI); + Register TrueReg = TRI->lookThruCopyLike(Reg, MRI); if (Register::isVirtualRegister(TrueReg)) { DefMI = MRI->getVRegDef(TrueReg); if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8 || @@ -3502,8 +3501,8 @@ bool PPCInstrInfo::foldFrameOffset(MachineInstr &MI) const { return false; assert(ADDIMI && "There should be ADDIMI for valid ToBeChangedReg."); - unsigned ToBeChangedReg = ADDIMI->getOperand(0).getReg(); - unsigned ScaleReg = ADDMI->getOperand(ScaleRegIdx).getReg(); + Register ToBeChangedReg = ADDIMI->getOperand(0).getReg(); + Register ScaleReg = ADDMI->getOperand(ScaleRegIdx).getReg(); auto NewDefFor = [&](unsigned Reg, MachineBasicBlock::iterator Start, MachineBasicBlock::iterator End) { for (auto It = ++Start; It != End; It++) @@ -3720,7 +3719,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, bool PPCInstrInfo::combineRLWINM(MachineInstr &MI, MachineInstr **ToErase) const { MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo(); - unsigned FoldingReg = MI.getOperand(1).getReg(); + Register FoldingReg = MI.getOperand(1).getReg(); if (!Register::isVirtualRegister(FoldingReg)) return false; MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); @@ -5266,7 +5265,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); - if (SrcMI != NULL) + if (SrcMI != nullptr) return isSignOrZeroExtended(*SrcMI, SignExt, Depth); return false; @@ -5290,7 +5289,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); - if (SrcMI != NULL) + if (SrcMI != nullptr) return isSignOrZeroExtended(*SrcMI, SignExt, Depth); return false; @@ -5319,7 +5318,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, if (!Register::isVirtualRegister(SrcReg)) return false; const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); - if (SrcMI == 
NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1)) + if (SrcMI == nullptr || + !isSignOrZeroExtended(*SrcMI, SignExt, Depth + 1)) return false; } else diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 2340be5b5915..c26b4f6ceb7d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -5530,6 +5530,30 @@ def DWBytes3210 { (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), Word, sub_32)); } +// These instructions store a hash computed from the value of the link register +// and the value of the stack pointer. +let mayStore = 1 in { +def HASHST : XForm_XD6_RA5_RB5<31, 722, (outs), + (ins gprc:$RB, memrihash:$D_RA_XD), + "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>; +def HASHSTP : XForm_XD6_RA5_RB5<31, 658, (outs), + (ins gprc:$RB, memrihash:$D_RA_XD), + "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>; +} + +// These instructions check a hash computed from the value of the link register +// and the value of the stack pointer. The hasSideEffects flag is needed as the +// instruction may TRAP if the hash does not match the hash stored at the +// specified address. +let mayLoad = 1, hasSideEffects = 1 in { +def HASHCHK : XForm_XD6_RA5_RB5<31, 754, (outs), + (ins gprc:$RB, memrihash:$D_RA_XD), + "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>; +def HASHCHKP : XForm_XD6_RA5_RB5<31, 690, (outs), + (ins gprc:$RB, memrihash:$D_RA_XD), + "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>; +} + // Now both high word and low word are reversed, next // swap the high word and low word. def : Pat<(i64 (bitreverse i64:$A)), diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index a19289e96b3e..fe354208533b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -2398,6 +2398,8 @@ let Predicates = [IsISA3_1] in { let Predicates = [IsISA3_1, HasVSX] in { def XVCVSPBF16 : XX2_XT6_XO5_XB6<60, 17, 475, "xvcvspbf16", vsrc, []>; def XVCVBF16SPN : XX2_XT6_XO5_XB6<60, 16, 475, "xvcvbf16spn", vsrc, []>; + def XSMAXCQP : X_VT5_VA5_VB5<63, 676, "xsmaxcqp", []>; + def XSMINCQP : X_VT5_VA5_VB5<63, 740, "xsmincqp", []>; } // Multiclass defining patterns for Set Boolean Extension Reverse Instructions. diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index d12a9b806fd0..e5fa02bc8ccf 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -107,10 +107,10 @@ private: void initialize(MachineFunction &MFParm); // Perform peepholes. - bool simplifyCode(void); + bool simplifyCode(); // Perform peepholes. - bool eliminateRedundantCompare(void); + bool eliminateRedundantCompare(); bool eliminateRedundantTOCSaves(std::map<MachineInstr *, bool> &TOCSaves); bool combineSEXTAndSHL(MachineInstr &MI, MachineInstr *&ToErase); bool emitRLDICWhenLoweringJumpTables(MachineInstr &MI); @@ -258,12 +258,12 @@ void PPCMIPeephole::UpdateTOCSaves( } bool Keep = true; - for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) { - MachineInstr *CurrInst = It->first; + for (auto &I : TOCSaves) { + MachineInstr *CurrInst = I.first; // If new instruction dominates an existing one, mark existing one as // redundant. - if (It->second && MDT->dominates(MI, CurrInst)) - It->second = false; + if (I.second && MDT->dominates(MI, CurrInst)) + I.second = false; // Check if the new instruction is redundant. 
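The rewritten UpdateTOCSaves loop above implements a small pruning rule: a TOC save is kept only while no other kept save dominates it. A minimal stand-alone sketch of that logic, with dominance abstracted to a caller-supplied predicate (toy code, not the PPCMIPeephole API; the real pass queries MachineDominatorTree):

#include <functional>
#include <map>

template <typename T>
void updateSaves(std::map<T, bool> &Saves, T NewSave,
                 const std::function<bool(T, T)> &Dominates) {
  bool Keep = true;
  for (auto &Entry : Saves) {
    // The new save makes any save it dominates redundant...
    if (Entry.second && Dominates(NewSave, Entry.first))
      Entry.second = false;
    // ...and is itself redundant if an existing save dominates it.
    if (Dominates(Entry.first, NewSave))
      Keep = false;
  }
  Saves.emplace(NewSave, Keep);
}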
if (MDT->dominates(CurrInst, MI)) { Keep = false; @@ -381,7 +381,7 @@ static void convertUnprimedAccPHIs(const PPCInstrInfo *TII, } // Perform peephole optimizations. -bool PPCMIPeephole::simplifyCode(void) { +bool PPCMIPeephole::simplifyCode() { bool Simplified = false; bool TrapOpt = false; MachineInstr* ToErase = nullptr; @@ -481,7 +481,7 @@ bool PPCMIPeephole::simplifyCode(void) { // PPC::ZERO. if (!MI.getOperand(1).isImm() || MI.getOperand(1).getImm() != 0) break; - unsigned MIDestReg = MI.getOperand(0).getReg(); + Register MIDestReg = MI.getOperand(0).getReg(); for (MachineInstr& UseMI : MRI->use_instructions(MIDestReg)) Simplified |= TII->onlyFoldImmediate(UseMI, MI, MIDestReg); if (MRI->use_nodbg_empty(MIDestReg)) { @@ -519,9 +519,9 @@ bool PPCMIPeephole::simplifyCode(void) { // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. // We have to look through chains of COPY and SUBREG_TO_REG // to find the real source values for comparison. - unsigned TrueReg1 = + Register TrueReg1 = TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); - unsigned TrueReg2 = + Register TrueReg2 = TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); if (!(TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1))) @@ -541,7 +541,7 @@ bool PPCMIPeephole::simplifyCode(void) { auto isConversionOfLoadAndSplat = [=]() -> bool { if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS) return false; - unsigned FeedReg1 = + Register FeedReg1 = TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); if (Register::isVirtualRegister(FeedReg1)) { MachineInstr *LoadMI = MRI->getVRegDef(FeedReg1); @@ -565,16 +565,16 @@ bool PPCMIPeephole::simplifyCode(void) { // If this is a splat or a swap fed by another splat, we // can replace it with a copy. if (DefOpc == PPC::XXPERMDI) { - unsigned DefReg1 = DefMI->getOperand(1).getReg(); - unsigned DefReg2 = DefMI->getOperand(2).getReg(); + Register DefReg1 = DefMI->getOperand(1).getReg(); + Register DefReg2 = DefMI->getOperand(2).getReg(); unsigned DefImmed = DefMI->getOperand(3).getImm(); // If the two inputs are not the same register, check to see if // they originate from the same virtual register after only // copy-like instructions. if (DefReg1 != DefReg2) { - unsigned FeedReg1 = TRI->lookThruCopyLike(DefReg1, MRI); - unsigned FeedReg2 = TRI->lookThruCopyLike(DefReg2, MRI); + Register FeedReg1 = TRI->lookThruCopyLike(DefReg1, MRI); + Register FeedReg2 = TRI->lookThruCopyLike(DefReg2, MRI); if (!(FeedReg1 == FeedReg2 && Register::isVirtualRegister(FeedReg1))) @@ -643,7 +643,7 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::XXSPLTW: { unsigned MyOpcode = MI.getOpcode(); unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2; - unsigned TrueReg = + Register TrueReg = TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); if (!Register::isVirtualRegister(TrueReg)) break; @@ -707,7 +707,7 @@ bool PPCMIPeephole::simplifyCode(void) { } case PPC::XVCVDPSP: { // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant. - unsigned TrueReg = + Register TrueReg = TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); if (!Register::isVirtualRegister(TrueReg)) break; @@ -716,9 +716,9 @@ bool PPCMIPeephole::simplifyCode(void) { // This can occur when building a vector of single precision or integer // values. 
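Several of the peepholes above lean on TRI->lookThruCopyLike to compare values hiding behind chains of COPY and SUBREG_TO_REG. A toy model of that walk, assuming a simple register-to-copy-source map and SSA form, so the chain cannot cycle (illustrative only, not the TargetRegisterInfo API):

#include <cstdint>
#include <unordered_map>

// Follow copy-like definitions until a register that is not produced by a
// copy is reached.
uint32_t lookThruCopies(uint32_t Reg,
                        const std::unordered_map<uint32_t, uint32_t> &CopySrc) {
  for (auto It = CopySrc.find(Reg); It != CopySrc.end(); It = CopySrc.find(Reg))
    Reg = It->second; // step from the copy's destination to its source
  return Reg;
}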
if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { - unsigned DefsReg1 = + Register DefsReg1 = TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); - unsigned DefsReg2 = + Register DefsReg2 = TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); if (!Register::isVirtualRegister(DefsReg1) || !Register::isVirtualRegister(DefsReg2)) @@ -1178,7 +1178,7 @@ static unsigned getIncomingRegForBlock(MachineInstr *Phi, static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1, MachineBasicBlock *BB2, MachineRegisterInfo *MRI) { unsigned SrcReg = Reg; - while (1) { + while (true) { unsigned NextReg = SrcReg; MachineInstr *Inst = MRI->getVRegDef(SrcReg); if (BB1 && Inst->getOpcode() == PPC::PHI && Inst->getParent() == BB2) { @@ -1334,7 +1334,7 @@ bool PPCMIPeephole::eliminateRedundantTOCSaves( // cmpwi r3, 0 ; greater than -1 means greater or equal to 0 // bge 0, .LBB0_4 -bool PPCMIPeephole::eliminateRedundantCompare(void) { +bool PPCMIPeephole::eliminateRedundantCompare() { bool Simplified = false; for (MachineBasicBlock &MBB2 : *MF) { @@ -1737,4 +1737,3 @@ INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, char PPCMIPeephole::ID = 0; FunctionPass* llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); } - diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 4bccc5596d2b..76b016c0ee79 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -390,6 +390,18 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool PPCRegisterInfo::isAsmClobberable(const MachineFunction &MF, + MCRegister PhysReg) const { + // We cannot use getReservedRegs() to find the registers that are not asm + // clobberable because there are some reserved registers which can be + // clobbered by inline asm. For example, when LR is clobbered, the register is + // saved and restored. We will hardcode the registers that are not asm + // clobberable in this function.
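The distinction drawn in that comment is visible from user code: the link register may appear in an inline-asm clobber list because the compiler can save and restore it, while the stack pointer cannot be recovered once smashed. A hedged illustration in GNU inline asm for PPC64 (assuming the toolchain accepts the "lr" register name; the nop is only a placeholder):

void poke_lr(void) {
  // Clobbering the link register is fine: the compiler saves and restores
  // it around the statement, as described above.
  asm volatile("nop" ::: "lr");
  // Listing "r1" (the stack pointer) instead would be meaningless, since
  // nothing can restore it; that is what isAsmClobberable() now reports.
}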
+ + // The stack pointer (R1/X1) is not clobberable by inline asm + return PhysReg != PPC::R1 && PhysReg != PPC::X1; +} + bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const PPCInstrInfo *InstrInfo = Subtarget.getInstrInfo(); @@ -423,7 +435,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co continue; int FrIdx = Info[i].getFrameIdx(); - unsigned Reg = Info[i].getReg(); + Register Reg = Info[i].getReg(); const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(RC); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 2e534dd1bcd5..114f6d0f4c66 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -91,6 +91,8 @@ public: void adjustStackMapLiveOutMask(uint32_t *Mask) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isAsmClobberable(const MachineFunction &MF, + MCRegister PhysReg) const override; bool isCallerPreservedPhysReg(MCRegister PhysReg, const MachineFunction &MF) const override; @@ -185,6 +187,10 @@ public: return RegName; } + + bool isNonallocatableRegisterCalleeSave(MCRegister Reg) const override { + return Reg == PPC::LR || Reg == PPC::LR8; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index ed28731b8ef2..cc5738a5d7b6 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -374,11 +374,10 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, // clobbers ctr. auto asmClobbersCTR = [](InlineAsm *IA) { InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; + for (const InlineAsm::ConstraintInfo &C : CIV) { if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_insensitive("{ctr}")) + for (const auto &Code : C.Codes) + if (StringRef(Code).equals_insensitive("{ctr}")) return true; } return false; @@ -653,11 +652,17 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, } return true; - } else if (isa<BinaryOperator>(J) && - (J->getType()->getScalarType()->isFP128Ty() || + } else if ((J->getType()->getScalarType()->isFP128Ty() || J->getType()->getScalarType()->isPPC_FP128Ty())) { // Most operations on f128 or ppc_f128 values become calls. return true; + } else if (isa<FCmpInst>(J) && + J->getOperand(0)->getType()->getScalarType()->isFP128Ty()) { + return true; + } else if ((isa<FPTruncInst>(J) || isa<FPExtInst>(J)) && + (cast<CastInst>(J)->getSrcTy()->getScalarType()->isFP128Ty() || + cast<CastInst>(J)->getDestTy()->getScalarType()->isFP128Ty())) { + return true; } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) || isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) { CastInst *CI = cast<CastInst>(J); @@ -1295,8 +1300,8 @@ bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) { // Process nested loops first. 
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - if (canSaveCmp(*I, BI, SE, LI, DT, AC, LibInfo)) + for (Loop *I : *L) + if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo)) return false; // Stop search. HardwareLoopInfo HWLoopInfo(L); diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 0be35adc35c7..8a7d324ddfe1 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -297,18 +297,16 @@ protected: // fma result. LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg); - for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end(); - AI != AE; ++AI) { + for (auto &AI : FMAInt) { // Don't add the segment that corresponds to the original copy. - if (AI->valno == AddendValNo) + if (AI.valno == AddendValNo) continue; VNInfo *NewFMAValNo = - NewFMAInt.getNextValue(AI->start, - LIS->getVNInfoAllocator()); + NewFMAInt.getNextValue(AI.start, LIS->getVNInfoAllocator()); - NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end, - NewFMAValNo)); + NewFMAInt.addSegment( + LiveInterval::Segment(AI.start, AI.end, NewFMAValNo)); } LLVM_DEBUG(dbgs() << " extended: " << NewFMAInt << '\n'); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 75592dd4c6f5..a2ea34fe11c7 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -302,7 +302,7 @@ struct RISCVOperand : public MCParsedAsmOperand { struct VTypeOp VType; }; - RISCVOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + RISCVOperand(KindTy K) : Kind(K) {} public: RISCVOperand(const RISCVOperand &o) : MCParsedAsmOperand() { @@ -337,7 +337,6 @@ public: bool isImm() const override { return Kind == KindTy::Immediate; } bool isMem() const override { return false; } bool isSystemRegister() const { return Kind == KindTy::SystemRegister; } - bool isVType() const { return Kind == KindTy::VType; } bool isGPR() const { return Kind == KindTy::Register && @@ -421,7 +420,27 @@ public: bool isCSRSystemRegister() const { return isSystemRegister(); } - bool isVTypeI() const { return isVType(); } + bool isVTypeImm(unsigned N) const { + int64_t Imm; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + if (!isImm()) + return false; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + return IsConstantImm && isUIntN(N, Imm) && VK == RISCVMCExpr::VK_RISCV_None; + } + + // If the last operand of the vsetvli/vsetvli instruction is a constant + // expression, KindTy is Immediate. + bool isVTypeI10() const { + if (Kind == KindTy::Immediate) + return isVTypeImm(10); + return Kind == KindTy::VType; + } + bool isVTypeI11() const { + if (Kind == KindTy::Immediate) + return isVTypeImm(11); + return Kind == KindTy::VType; + } /// Return true if the operand is a valid for the fence instruction e.g. /// ('iorw'). 
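The isVTypeI10/isVTypeI11 predicates added above let a raw immediate stand in for a symbolic vtype, provided it fits in 10 or 11 bits. A sketch of the fit check, and of how a symbolic setting maps to such an immediate, using the RVV 1.0 vtype field layout (fitsUIntN and encodeVType are illustrative helpers, not LLVM functions):

#include <cassert>
#include <cstdint>

// True iff Imm is non-negative and representable in N bits.
bool fitsUIntN(unsigned N, int64_t Imm) {
  return Imm >= 0 && N < 64 && (static_cast<uint64_t>(Imm) >> N) == 0;
}

// RVV 1.0 vtype layout: bits [2:0] = vlmul, [5:3] = vsew, [6] = vta, [7] = vma.
unsigned encodeVType(unsigned VLMul, unsigned VSew, bool TA, bool MA) {
  return (MA ? 0x80u : 0u) | (TA ? 0x40u : 0u) | (VSew << 3) | VLMul;
}

int main() {
  // e32 (vsew = 2), m1 (vlmul = 0), ta, ma encodes as 0xd0, so
  // "vsetvli rd, rs1, 0xd0" names the same configuration numerically.
  unsigned VType = encodeVType(0, 2, true, true);
  assert(VType == 0xd0);
  assert(fitsUIntN(10, VType) && fitsUIntN(11, VType));
}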
@@ -547,6 +566,16 @@ public: return IsConstantImm && isUInt<7>(Imm) && VK == RISCVMCExpr::VK_RISCV_None; } + bool isRnumArg() const { + int64_t Imm; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + if (!isImm()) + return false; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + return IsConstantImm && Imm >= INT64_C(0) && Imm <= INT64_C(10) && + VK == RISCVMCExpr::VK_RISCV_None; + } + bool isSImm5() const { if (!isImm()) return false; @@ -898,9 +927,21 @@ public: Inst.addOperand(MCOperand::createImm(SysReg.Encoding)); } + // Support non-canonical syntax: + // "vsetivli rd, uimm, 0xabc" or "vsetvli rd, rs1, 0xabc" + // "vsetivli rd, uimm, (0xc << N)" or "vsetvli rd, rs1, (0xc << N)" void addVTypeIOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(getVType())); + int64_t Imm = 0; + if (Kind == KindTy::Immediate) { + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + (void)IsConstantImm; + assert(IsConstantImm && "Invalid VTypeI Operand!"); + } else { + Imm = getVType(); + } + Inst.addOperand(MCOperand::createImm(Imm)); } // Returns the rounding mode represented by this RISCVOperand. Should only @@ -1209,6 +1250,9 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, (1 << 4), "immediate must be in the range"); } + case Match_InvalidRnumArg: { + return generateImmOutOfRangeError(Operands, ErrorInfo, 0, 10); + } } llvm_unreachable("Unknown match type detected!"); @@ -1881,8 +1925,10 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info, Operands.push_back(RISCVOperand::createToken(Name, NameLoc, isRV64())); // If there are no more operands, then finish - if (getLexer().is(AsmToken::EndOfStatement)) + if (getLexer().is(AsmToken::EndOfStatement)) { + getParser().Lex(); // Consume the EndOfStatement. return false; + } // Parse first operand if (parseOperand(Operands, Name)) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 9cfd36745f46..01c6bd90ea58 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -191,7 +191,8 @@ enum OperandType : unsigned { OPERAND_SIMM12, OPERAND_UIMM20, OPERAND_UIMMLOG2XLEN, - OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN, + OPERAND_RVKRNUM, + OPERAND_LAST_RISCV_IMM = OPERAND_RVKRNUM, // Operand is either a register or uimm5, this is used by V extension pseudo // instructions to represent a value that be passed as AVL to either vsetvli // or vsetivli. 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h index 59d8bb009d1c..7ce7dafb8ca1 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H -#define LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H +#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVELFSTREAMER_H +#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVELFSTREAMER_H #include "RISCVTargetStreamer.h" #include "llvm/MC/MCELFStreamer.h" diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 89a7d54f60f8..3268740849f0 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -85,7 +85,7 @@ void RISCVInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); const MCOperand &MO = MI->getOperand(OpNo); if (MO.isReg()) { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 0ee6d8de78c9..18858209aa9b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -141,6 +141,24 @@ static void generateInstSeqImpl(int64_t Val, Res.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12)); } +static unsigned extractRotateInfo(int64_t Val) { + // for case: 0b111..1..xxxxxx1..1.. + unsigned LeadingOnes = countLeadingOnes((uint64_t)Val); + unsigned TrailingOnes = countTrailingOnes((uint64_t)Val); + if (TrailingOnes > 0 && TrailingOnes < 64 && + (LeadingOnes + TrailingOnes) > (64 - 12)) + return 64 - TrailingOnes; + + // for case: 0bxxx1..1..1...xxx + unsigned UpperTrailingOnes = countTrailingOnes(Hi_32(Val)); + unsigned LowerLeadingOnes = countLeadingOnes(Lo_32(Val)); + if (UpperTrailingOnes < 32 && + (UpperTrailingOnes + LowerLeadingOnes) > (64 - 12)) + return 32 - UpperTrailingOnes; + + return 0; +} + namespace llvm { namespace RISCVMatInt { InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) { @@ -312,6 +330,18 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) { } } + // Perform optimization with rori in the Zbb extension. 
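To see why the rori rewrite below pays off, consider 0xFFFFFFFFFFFFFC0F: it has 54 leading and 4 trailing ones, so rotating it left by 64 - 4 = 60 yields -64, which a single ADDI materializes; one RORI then rotates it back. A stand-alone round-trip check of that arithmetic (plain C++, mirroring the shifts the emitted sequence performs):

#include <cassert>
#include <cstdint>

uint64_t rotl64(uint64_t V, unsigned R) {
  return R == 0 ? V : (V << R) | (V >> (64 - R));
}
uint64_t rotr64(uint64_t V, unsigned R) {
  return R == 0 ? V : (V >> R) | (V << (64 - R));
}

int main() {
  uint64_t Val = 0xFFFFFFFFFFFFFC0FULL; // 54 leading ones, 4 trailing ones
  unsigned Rotate = 64 - 4;             // 64 - countTrailingOnes(Val)
  int64_t Imm = static_cast<int64_t>(rotl64(Val, Rotate));
  assert(Imm == -64);                   // fits a 12-bit ADDI immediate
  // ADDI rd, x0, -64 ; RORI rd, rd, 60 reproduces the original constant:
  assert(rotr64(static_cast<uint64_t>(Imm), Rotate) == Val);
}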
+ if (Res.size() > 2 && ActiveFeatures[RISCV::FeatureStdExtZbb]) { + if (unsigned Rotate = extractRotateInfo(Val)) { + RISCVMatInt::InstSeq TmpSeq; + uint64_t NegImm12 = + ((uint64_t)Val >> (64 - Rotate)) | ((uint64_t)Val << Rotate); + assert(isInt<12>(NegImm12)); + TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDI, NegImm12)); + TmpSeq.push_back(RISCVMatInt::Inst(RISCV::RORI, Rotate)); + Res = TmpSeq; + } + } return Res; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index 02b4b18f54bd..6a8e0c640001 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_RISCV_MATINT_H -#define LLVM_LIB_TARGET_RISCV_MATINT_H +#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_MATINT_H +#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_MATINT_H #include "llvm/ADT/SmallVector.h" #include "llvm/MC/SubtargetFeature.h" @@ -15,7 +15,6 @@ namespace llvm { class APInt; -class MCSubtargetInfo; namespace RISCVMatInt { struct Inst { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h index 0bda3de0ce5d..171780d94ce7 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H -#define LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H +#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVTARGETSTREAMER_H +#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVTARGETSTREAMER_H #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index b415c9f35e7f..03462240fd93 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -40,6 +40,9 @@ FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM); FunctionPass *createRISCVGatherScatterLoweringPass(); void initializeRISCVGatherScatterLoweringPass(PassRegistry &); +FunctionPass *createRISCVSExtWRemovalPass(); +void initializeRISCVSExtWRemovalPass(PassRegistry &); + FunctionPass *createRISCVMergeBaseOffsetOptPass(); void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 6aa915c01929..5b0f27c5e937 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -42,7 +42,7 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, "'D' (Double-Precision Floating-Point)">; def FeatureStdExtZfhmin - : SubtargetFeature<"experimental-zfhmin", "HasStdExtZfhmin", "true", + : SubtargetFeature<"zfhmin", "HasStdExtZfhmin", "true", "'Zfhmin' (Half-Precision Floating-Point Minimal)", [FeatureStdExtF]>; def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, @@ -50,7 +50,7 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, "'Zfhmin' (Half-Precision Floating-Point Minimal)">; def FeatureStdExtZfh - : SubtargetFeature<"experimental-zfh", "HasStdExtZfh", "true", + : SubtargetFeature<"zfh", "HasStdExtZfh", "true", "'Zfh' (Half-Precision Floating-Point)", [FeatureStdExtZfhmin, FeatureStdExtF]>; def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">, @@ -65,83 +65,217 @@ def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">, "'C' (Compressed 
Instructions)">; def FeatureStdExtZba - : SubtargetFeature<"experimental-zba", "HasStdExtZba", "true", - "'Zba' (Address calculation 'B' Instructions)">; + : SubtargetFeature<"zba", "HasStdExtZba", "true", + "'Zba' (Address Generation Instructions)">; def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">, AssemblerPredicate<(all_of FeatureStdExtZba), - "'Zba' (Address calculation 'B' Instructions)">; + "'Zba' (Address Generation Instructions)">; def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">; def FeatureStdExtZbb - : SubtargetFeature<"experimental-zbb", "HasStdExtZbb", "true", - "'Zbb' (Base 'B' Instructions)">; + : SubtargetFeature<"zbb", "HasStdExtZbb", "true", + "'Zbb' (Basic Bit-Manipulation)">; def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">, AssemblerPredicate<(all_of FeatureStdExtZbb), - "'Zbb' (Base 'B' Instructions)">; + "'Zbb' (Basic Bit-Manipulation)">; def FeatureStdExtZbc - : SubtargetFeature<"experimental-zbc", "HasStdExtZbc", "true", - "'Zbc' (Carry-Less 'B' Instructions)">; + : SubtargetFeature<"zbc", "HasStdExtZbc", "true", + "'Zbc' (Carry-Less Multiplication)">; def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">, AssemblerPredicate<(all_of FeatureStdExtZbc), - "'Zbc' (Carry-Less 'B' Instructions)">; + "'Zbc' (Carry-Less Multiplication)">; def FeatureStdExtZbe : SubtargetFeature<"experimental-zbe", "HasStdExtZbe", "true", - "'Zbe' (Extract-Deposit 'B' Instructions)">; + "'Zbe' (Extract-Deposit 'Zb' Instructions)">; def HasStdExtZbe : Predicate<"Subtarget->hasStdExtZbe()">, AssemblerPredicate<(all_of FeatureStdExtZbe), - "'Zbe' (Extract-Deposit 'B' Instructions)">; + "'Zbe' (Extract-Deposit 'Zb' Instructions)">; def FeatureStdExtZbf : SubtargetFeature<"experimental-zbf", "HasStdExtZbf", "true", - "'Zbf' (Bit-Field 'B' Instructions)">; + "'Zbf' (Bit-Field 'Zb' Instructions)">; def HasStdExtZbf : Predicate<"Subtarget->hasStdExtZbf()">, AssemblerPredicate<(all_of FeatureStdExtZbf), - "'Zbf' (Bit-Field 'B' Instructions)">; + "'Zbf' (Bit-Field 'Zb' Instructions)">; def FeatureStdExtZbm : SubtargetFeature<"experimental-zbm", "HasStdExtZbm", "true", - "'Zbm' (Matrix 'B' Instructions)">; + "'Zbm' (Matrix 'Zb' Instructions)">; def HasStdExtZbm : Predicate<"Subtarget->hasStdExtZbm()">, AssemblerPredicate<(all_of FeatureStdExtZbm), - "'Zbm' (Matrix 'B' Instructions)">; + "'Zbm' (Matrix 'Zb' Instructions)">; def FeatureStdExtZbp : SubtargetFeature<"experimental-zbp", "HasStdExtZbp", "true", - "'Zbp' (Permutation 'B' Instructions)">; + "'Zbp' (Permutation 'Zb' Instructions)">; def HasStdExtZbp : Predicate<"Subtarget->hasStdExtZbp()">, AssemblerPredicate<(all_of FeatureStdExtZbp), - "'Zbp' (Permutation 'B' Instructions)">; + "'Zbp' (Permutation 'Zb' Instructions)">; def FeatureStdExtZbr : SubtargetFeature<"experimental-zbr", "HasStdExtZbr", "true", - "'Zbr' (Polynomial Reduction 'B' Instructions)">; + "'Zbr' (Polynomial Reduction 'Zb' Instructions)">; def HasStdExtZbr : Predicate<"Subtarget->hasStdExtZbr()">, AssemblerPredicate<(all_of FeatureStdExtZbr), - "'Zbr' (Polynomial Reduction 'B' Instructions)">; + "'Zbr' (Polynomial Reduction 'Zb' Instructions)">; def FeatureStdExtZbs - : SubtargetFeature<"experimental-zbs", "HasStdExtZbs", "true", - "'Zbs' (Single-Bit 'B' Instructions)">; + : SubtargetFeature<"zbs", "HasStdExtZbs", "true", + "'Zbs' (Single-Bit Instructions)">; def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">, AssemblerPredicate<(all_of FeatureStdExtZbs), - "'Zbs' (Single-Bit 'B' Instructions)">; + "'Zbs' (Single-Bit Instructions)">; 
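Zba, Zbb, Zbc and Zbs lose their "experimental-" prefix here because those subextensions were ratified, so they should now be reachable through an ordinary march string such as -march=rv64gc_zba_zbb_zbs (assuming a toolchain of this vintage). As a reminder of what the renamed Zbs single-bit instructions compute, plain C++ equivalents (the names mirror the mnemonics, not any LLVM API; shift amounts are masked to XLEN-1 as the ISA specifies):

#include <cassert>
#include <cstdint>

uint64_t bset(uint64_t Rs1, unsigned Bit) { return Rs1 | (1ULL << (Bit & 63)); }
uint64_t bclr(uint64_t Rs1, unsigned Bit) { return Rs1 & ~(1ULL << (Bit & 63)); }
uint64_t binv(uint64_t Rs1, unsigned Bit) { return Rs1 ^ (1ULL << (Bit & 63)); }
uint64_t bext(uint64_t Rs1, unsigned Bit) { return (Rs1 >> (Bit & 63)) & 1; }

int main() {
  assert(bset(0, 5) == 32);
  assert(bclr(0xFF, 0) == 0xFE);
  assert(binv(0x80, 7) == 0);
  assert(bext(0x40, 6) == 1);
}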
def FeatureStdExtZbt : SubtargetFeature<"experimental-zbt", "HasStdExtZbt", "true", - "'Zbt' (Ternary 'B' Instructions)">; + "'Zbt' (Ternary 'Zb' Instructions)">; def HasStdExtZbt : Predicate<"Subtarget->hasStdExtZbt()">, AssemblerPredicate<(all_of FeatureStdExtZbt), - "'Zbt' (Ternary 'B' Instructions)">; + "'Zbt' (Ternary 'Zb' Instructions)">; // Some instructions belong to both the basic and the permutation // subextensions. They should be enabled if either has been specified. def HasStdExtZbbOrZbp : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()">, AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbp), - "'Zbb' (Base 'B' Instructions) or " - "'Zbp' (Permutation 'B' Instructions)">; + "'Zbb' (Basic Bit-Manipulation) or " + "'Zbp' (Permutation 'Zb' Instructions)">; + +def FeatureStdExtZbkb + : SubtargetFeature<"zbkb", "HasStdExtZbkb", "true", + "'Zbkb' (Bitmanip instructions for Cryptography)">; +def HasStdExtZbkb : Predicate<"Subtarget->hasStdExtZbkb()">, + AssemblerPredicate<(all_of FeatureStdExtZbkb), + "'Zbkb' (Bitmanip instructions for Cryptography)">; + +def FeatureStdExtZbkx + : SubtargetFeature<"zbkx", "HasStdExtZbkx", "true", + "'Zbkx' (Crossbar permutation instructions)">; +def HasStdExtZbkx : Predicate<"Subtarget->hasStdExtZbkx()">, + AssemblerPredicate<(all_of FeatureStdExtZbkx), + "'Zbkx' (Crossbar permutation instructions)">; + +def HasStdExtZbpOrZbkx + : Predicate<"Subtarget->hasStdExtZbp() || Subtarget->hasStdExtZbkx()">, + AssemblerPredicate<(any_of FeatureStdExtZbp, FeatureStdExtZbkx), + "'Zbp' (Permutation 'Zb' Instructions) or " + "'Zbkx' (Crossbar permutation instructions)">; + +def HasStdExtZbpOrZbkb + : Predicate<"Subtarget->hasStdExtZbp() || Subtarget->hasStdExtZbkb()">, + AssemblerPredicate<(any_of FeatureStdExtZbp, FeatureStdExtZbkb), + "'Zbp' (Permutation 'Zb' Instructions) or " + "'Zbkb' (Bitmanip instructions for Cryptography)">; + +def HasStdExtZbbOrZbkb + : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb()">, + AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb), + "'Zbb' (Basic Bit-Manipulation) or " + "'Zbkb' (Bitmanip instructions for Cryptography)">; + +def HasStdExtZbbOrZbpOrZbkb + : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp() || Subtarget->hasStdExtZbkb()">, + AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbp, FeatureStdExtZbkb), + "'Zbb' (Basic Bit-Manipulation) or " + "'Zbp' (Permutation 'Zb' Instructions) or " + "'Zbkb' (Bitmanip instructions for Cryptography)">; + +// The Carry-less multiply subextension for cryptography is a subset of basic carry-less multiply subextension. The former should be enabled if the latter is enabled. 
+def FeatureStdExtZbkc + : SubtargetFeature<"zbkc", "HasStdExtZbkc", "true", + "'Zbkc' (Carry-less multiply instructions for Cryptography)">; +def HasStdExtZbkc + : Predicate<"Subtarget->hasStdExtZbkc()">, + AssemblerPredicate<(all_of FeatureStdExtZbkc), + "'Zbkc' (Carry-less multiply instructions for Cryptography)">; + +def HasStdExtZbcOrZbkc + : Predicate<"Subtarget->hasStdExtZbc() || Subtarget->hasStdExtZbkc()">, + AssemblerPredicate<(any_of FeatureStdExtZbc, FeatureStdExtZbkc), + "'Zbc' (Carry-Less Multiplication) or " + "'Zbkc' (Carry-less multiply instructions for Cryptography)">; + +def FeatureStdExtZknd + : SubtargetFeature<"zknd", "HasStdExtZknd", "true", + "'Zknd' (NIST Suite: AES Decryption)">; +def HasStdExtZknd : Predicate<"Subtarget->hasStdExtZknd()">, + AssemblerPredicate<(all_of FeatureStdExtZknd), + "'Zknd' (NIST Suite: AES Decryption)">; + +def FeatureStdExtZkne + : SubtargetFeature<"zkne", "HasStdExtZkne", "true", + "'Zkne' (NIST Suite: AES Encryption)">; +def HasStdExtZkne : Predicate<"Subtarget->hasStdExtZkne()">, + AssemblerPredicate<(all_of FeatureStdExtZkne), + "'Zkne' (NIST Suite: AES Encryption)">; + +// Some instructions belong to both Zknd and Zkne subextensions. +// They should be enabled if either has been specified. +def HasStdExtZkndOrZkne + : Predicate<"Subtarget->hasStdExtZknd() || Subtarget->hasStdExtZkne()">, + AssemblerPredicate<(any_of FeatureStdExtZknd, FeatureStdExtZkne), + "'Zknd' (NIST Suite: AES Decryption) or " + "'Zkne' (NIST Suite: AES Encryption)">; + +def FeatureStdExtZknh + : SubtargetFeature<"zknh", "HasStdExtZknh", "true", + "'Zknh' (NIST Suite: Hash Function Instructions)">; +def HasStdExtZknh : Predicate<"Subtarget->hasStdExtZknh()">, + AssemblerPredicate<(all_of FeatureStdExtZknh), + "'Zknh' (NIST Suite: Hash Function Instructions)">; + +def FeatureStdExtZksed + : SubtargetFeature<"zksed", "HasStdExtZksed", "true", + "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; +def HasStdExtZksed : Predicate<"Subtarget->hasStdExtZksed()">, + AssemblerPredicate<(all_of FeatureStdExtZksed), + "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; + +def FeatureStdExtZksh + : SubtargetFeature<"zksh", "HasStdExtZksh", "true", + "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">; +def HasStdExtZksh : Predicate<"Subtarget->hasStdExtZksh()">, + AssemblerPredicate<(all_of FeatureStdExtZksh), + "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">; + +def FeatureStdExtZkr + : SubtargetFeature<"zkr", "HasStdExtZkr", "true", + "'Zkr' (Entropy Source Extension)">; +def HasStdExtZkr : Predicate<"Subtarget->hasStdExtZkr()">, + AssemblerPredicate<(all_of FeatureStdExtZkr), + "'Zkr' (Entropy Source Extension)">; + +def FeatureStdExtZkn + : SubtargetFeature<"zkn", "HasStdExtZkn", "true", + "'Zkn' (NIST Algorithm Suite)", + [FeatureStdExtZbkb, + FeatureStdExtZbkc, + FeatureStdExtZbkx, + FeatureStdExtZkne, + FeatureStdExtZknd, + FeatureStdExtZknh]>; + +def FeatureStdExtZks + : SubtargetFeature<"zks", "HasStdExtZks", "true", + "'Zks' (ShangMi Algorithm Suite)", + [FeatureStdExtZbkb, + FeatureStdExtZbkc, + FeatureStdExtZbkx, + FeatureStdExtZksed, + FeatureStdExtZksh]>; + +def FeatureStdExtZkt + : SubtargetFeature<"zkt", "HasStdExtZkt", "true", + "'Zkt' (Data Independent Execution Latency)">; + +def FeatureStdExtZk + : SubtargetFeature<"zk", "HasStdExtZk", "true", + "'Zk' (Standard scalar cryptography extension)", + [FeatureStdExtZkn, + FeatureStdExtZkr, + FeatureStdExtZkt]>; def FeatureNoRVCHints : SubtargetFeature<"no-rvc-hints", 
"EnableRVCHintInstrs", "false", @@ -150,23 +284,66 @@ def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, AssemblerPredicate<(all_of(not FeatureNoRVCHints)), "RVC Hint Instructions">; +def FeatureStdExtZvl32b : SubtargetFeature<"zvl32b", "ZvlLen", "ExtZvl::Zvl32b", + "'Zvl' (Minimum Vector Length) 32">; + +foreach i = { 6-15 } in { + defvar I = !shl(1, i); + def FeatureStdExtZvl#I#b : + SubtargetFeature<"zvl"#I#"b", "ZvlLen", "ExtZvl::Zvl"#I#"b", + "'Zvl' (Minimum Vector Length) "#I, + [!cast<SubtargetFeature>("FeatureStdExtZvl"#!srl(I, 1)#"b")]>; +} + +def FeatureStdExtZve32x + : SubtargetFeature<"zve32x", "HasStdExtZve32x", "true", + "'Zve32x' (Vector Extensions for Embedded Processors " + "with maximal 32 EEW)", + [FeatureStdExtZvl32b]>; + +def FeatureStdExtZve32f + : SubtargetFeature<"zve32f", "HasStdExtZve32f", "true", + "'Zve32f' (Vector Extensions for Embedded Processors " + "with maximal 32 EEW and F extension)", + [FeatureStdExtZve32x]>; + +def FeatureStdExtZve64x + : SubtargetFeature<"zve64x", "HasStdExtZve64x", "true", + "'Zve64x' (Vector Extensions for Embedded Processors " + "with maximal 64 EEW)", [FeatureStdExtZve32x, FeatureStdExtZvl64b]>; + +def FeatureStdExtZve64f + : SubtargetFeature<"zve64f", "HasStdExtZve64f", "true", + "'Zve64f' (Vector Extensions for Embedded Processors " + "with maximal 64 EEW and F extension)", + [FeatureStdExtZve32f, FeatureStdExtZve64x]>; + +def FeatureStdExtZve64d + : SubtargetFeature<"zve64d", "HasStdExtZve64d", "true", + "'Zve64d' (Vector Extensions for Embedded Processors " + "with maximal 64 EEW, F and D extension)", + [FeatureStdExtZve64f]>; + def FeatureStdExtV - : SubtargetFeature<"experimental-v", "HasStdExtV", "true", - "'V' (Vector Instructions)">; -def HasStdExtV : Predicate<"Subtarget->hasStdExtV()">, - AssemblerPredicate<(all_of FeatureStdExtV), - "'V' (Vector Instructions)">; - -def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">; -def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">; - -def FeatureStdExtZvlsseg - : SubtargetFeature<"experimental-zvlsseg", "HasStdExtZvlsseg", "true", - "'Zvlsseg' (Vector segment load/store instructions)", - [FeatureStdExtV]>; -def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">, - AssemblerPredicate<(all_of FeatureStdExtZvlsseg), - "'Zvlsseg' (Vector segment load/store instructions)">; + : SubtargetFeature<"v", "HasStdExtV", "true", + "'V' (Vector Extension for Application Processors)", + [FeatureStdExtZvl128b, FeatureStdExtF, FeatureStdExtD]>; + +def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">, + AssemblerPredicate< + (any_of FeatureStdExtZve32x, FeatureStdExtV), + "'V' (Vector Extension for Application Processors), 'Zve32x' or " + "'Zve64x' (Vector Extensions for Embedded Processors)">; +def HasVInstructionsI64 : Predicate<"Subtarget->hasVInstructionsI64()">, + AssemblerPredicate< + (any_of FeatureStdExtZve64x, FeatureStdExtV), + "'V' (Vector Extension for Application Processors) or 'Zve64x' " + "(Vector Extensions for Embedded Processors)">; +def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">, + AssemblerPredicate< + (any_of FeatureStdExtZve32f, FeatureStdExtV), + "'V' (Vector Extension for Application Processors), 'Zve32f', " + "'Zve64f' or 'Zve64d' (Vector Extensions for Embedded Processors)">; def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; @@ -198,6 +375,9 @@ foreach i = {1-31} in def FeatureSaveRestore : SubtargetFeature<"save-restore", 
"EnableSaveRestore", "true", "Enable save/restore.">; +def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", + "SiFive 7-Series processors">; + //===----------------------------------------------------------------------===// // Named operands for CSR instructions. //===----------------------------------------------------------------------===// @@ -226,8 +406,10 @@ def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>; def : ProcessorModel<"rocket-rv32", RocketModel, []>; def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>; -def : ProcessorModel<"sifive-7-rv32", SiFive7Model, []>; -def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit]>; +def : ProcessorModel<"sifive-7-rv32", SiFive7Model, [], + [TuneSiFive7]>; +def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit], + [TuneSiFive7]>; def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM, FeatureStdExtC]>; @@ -253,7 +435,8 @@ def : ProcessorModel<"sifive-e34", RocketModel, [FeatureStdExtM, def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM, FeatureStdExtA, FeatureStdExtF, - FeatureStdExtC]>; + FeatureStdExtC], + [TuneSiFive7]>; def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit, FeatureStdExtM, @@ -277,7 +460,8 @@ def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit, FeatureStdExtA, FeatureStdExtF, FeatureStdExtD, - FeatureStdExtC]>; + FeatureStdExtC], + [TuneSiFive7]>; def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit, FeatureStdExtM, @@ -291,7 +475,8 @@ def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit, FeatureStdExtA, FeatureStdExtF, FeatureStdExtD, - FeatureStdExtC]>; + FeatureStdExtC], + [TuneSiFive7]>; //===----------------------------------------------------------------------===// // Define the RISC-V target. diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f5d491938050..ad003404d793 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -242,7 +242,8 @@ bool RISCVFrameLowering::hasBP(const MachineFunction &MF) const { // adjustment, we can not use SP to access the stack objects for the // arguments. Instead, use BP to access these stack objects. return (MFI.hasVarSizedObjects() || - (!hasReservedCallFrame(MF) && MFI.getMaxCallFrameSize() != 0)) && + (!hasReservedCallFrame(MF) && (!MFI.isMaxCallFrameSizeComputed() || + MFI.getMaxCallFrameSize() != 0))) && TRI->hasStackRealignment(MF); } @@ -940,11 +941,22 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( } static bool hasRVVFrameObject(const MachineFunction &MF) { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) - if (MFI.getStackID(I) == TargetStackID::ScalableVector) - return true; - return false; + // Originally, the function will scan all the stack objects to check whether + // if there is any scalable vector object on the stack or not. However, it + // causes errors in the register allocator. In issue 53016, it returns false + // before RA because there is no RVV stack objects. After RA, it returns true + // because there are spilling slots for RVV values during RA. It will not + // reserve BP during register allocation and generate BP access in the PEI + // pass due to the inconsistent behavior of the function. + // + // The function is changed to use hasVInstructions() as the return value. 
It + // is not precise, but it can make the register allocation correct. + // + // FIXME: Find a better way to make the decision or revisit the solution in + // D103622. + // + // Refer to https://github.com/llvm/llvm-project/issues/53016. + return MF.getSubtarget<RISCVSubtarget>().hasVInstructions(); } // Not preserve stack space within prologue for outgoing variables when the diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index d47bd739235f..ba91b16661a4 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -127,6 +127,41 @@ static std::pair<Value *, Value *> matchStridedConstant(Constant *StartC) { return std::make_pair(StartVal, Stride); } +static std::pair<Value *, Value *> matchStridedStart(Value *Start, + IRBuilder<> &Builder) { + // Base case, start is a strided constant. + auto *StartC = dyn_cast<Constant>(Start); + if (StartC) + return matchStridedConstant(StartC); + + // Not a constant, maybe it's a strided constant with a splat added to it. + auto *BO = dyn_cast<BinaryOperator>(Start); + if (!BO || BO->getOpcode() != Instruction::Add) + return std::make_pair(nullptr, nullptr); + + // Look for an operand that is splatted. + unsigned OtherIndex = 1; + Value *Splat = getSplatValue(BO->getOperand(0)); + if (!Splat) { + Splat = getSplatValue(BO->getOperand(1)); + OtherIndex = 0; + } + if (!Splat) + return std::make_pair(nullptr, nullptr); + + Value *Stride; + std::tie(Start, Stride) = matchStridedStart(BO->getOperand(OtherIndex), + Builder); + if (!Start) + return std::make_pair(nullptr, nullptr); + + // Add the splat value to the start. + Builder.SetInsertPoint(BO); + Builder.SetCurrentDebugLocation(DebugLoc()); + Start = Builder.CreateAdd(Start, Splat); + return std::make_pair(Start, Stride); +} + // Recursively, walk about the use-def chain until we find a Phi with a strided // start value. Build and update a scalar recurrence as we unwind the recursion. // We also update the Stride as we unwind. Our goal is to move all of the @@ -161,12 +196,7 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, if (!Step) return false; - // Start should be a strided constant. 
- auto *StartC = dyn_cast<Constant>(Start); - if (!StartC) - return false; - - std::tie(Start, Stride) = matchStridedConstant(StartC); + std::tie(Start, Stride) = matchStridedStart(Start, Builder); if (!Start) return false; assert(Stride != nullptr); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b24eb5f7bbf4..5870502d74d5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -86,8 +86,12 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { SDVTList VTs = CurDAG->getVTList({VT, MVT::Other}); SDValue IntID = CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); - SDValue Ops[] = {Chain, IntID, StackSlot, - CurDAG->getRegister(RISCV::X0, MVT::i64), VL}; + SDValue Ops[] = {Chain, + IntID, + CurDAG->getUNDEF(VT), + StackSlot, + CurDAG->getRegister(RISCV::X0, MVT::i64), + VL}; SDValue Result = CurDAG->getMemIntrinsicNode( ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MVT::i64, MPI, Align(8), @@ -125,12 +129,37 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { CurDAG->RemoveDeadNodes(); } -static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm, - const RISCVSubtarget &Subtarget) { +static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL, + const MVT VT, int64_t Imm, + const RISCVSubtarget &Subtarget) { + assert(VT == MVT::i64 && "Expecting MVT::i64"); + const RISCVTargetLowering *TLI = Subtarget.getTargetLowering(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(CurDAG->getConstantPool( + ConstantInt::get(EVT(VT).getTypeForEVT(*CurDAG->getContext()), Imm), VT)); + SDValue Addr = TLI->getAddr(CP, *CurDAG); + SDValue Offset = CurDAG->getTargetConstant(0, DL, VT); + // Since there is no data race, the chain can be the entry node. + SDNode *Load = CurDAG->getMachineNode(RISCV::LD, DL, VT, Addr, Offset, + CurDAG->getEntryNode()); + MachineFunction &MF = CurDAG->getMachineFunction(); + MachineMemOperand *MemOp = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, + LLT(VT), CP->getAlign()); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Load), {MemOp}); + return Load; +} + +static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, + int64_t Imm, const RISCVSubtarget &Subtarget) { MVT XLenVT = Subtarget.getXLenVT(); RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits()); + // If Imm is expensive to build, then we put it into constant pool. 
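The policy guarding selectImmWithConstantPool is a cost trade-off: when the inline LUI/ADDI/SLLI sequence for Imm exceeds the subtarget's budget, a two-instruction constant-pool load wins. A deliberately crude stand-alone cost model in the same spirit (much simpler than RISCVMatInt::generateInstSeq; assumes arithmetic right shift of negative values, which mainstream compilers provide):

#include <cstdint>

// Crude upper bound on the number of instructions needed to build Val inline
// on RV64I: peel 12 signed bits at a time (ADDI), with an SLLI between steps.
unsigned roughBuildCost(int64_t Val) {
  if (Val >= -2048 && Val < 2048)
    return 1;                        // single ADDI from x0
  int64_t Hi = (Val + 0x800) >> 12;  // carry-adjusted upper part
  return roughBuildCost(Hi) + 2;     // build Hi, then SLLI + ADDI
}

// Mirrors the new policy: fall back to a constant-pool load (AUIPC + LD plus
// the pooled data) once the inline sequence exceeds the subtarget's budget.
bool preferConstantPool(int64_t Val, unsigned MaxBuildIntsCost) {
  return roughBuildCost(Val) > MaxBuildIntsCost;
}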
+ if (Subtarget.useConstantPoolForLargeInts() && + Seq.size() > Subtarget.getMaxBuildIntsCost()) + return selectImmWithConstantPool(CurDAG, DL, VT, Imm, Subtarget); + SDNode *Result = nullptr; SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT); for (RISCVMatInt::Inst &Inst : Seq) { @@ -372,6 +401,10 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, bool IsMasked, RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + report_fatal_error("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), static_cast<unsigned>(IndexLMUL)); @@ -450,6 +483,10 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, bool IsMasked, RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + report_fatal_error("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), static_cast<unsigned>(IndexLMUL)); @@ -462,6 +499,75 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, bool IsMasked, ReplaceNode(Node, Store); } +void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) { + if (!Subtarget->hasVInstructions()) + return; + + assert((Node->getOpcode() == ISD::INTRINSIC_W_CHAIN || + Node->getOpcode() == ISD::INTRINSIC_WO_CHAIN) && + "Unexpected opcode"); + + SDLoc DL(Node); + MVT XLenVT = Subtarget->getXLenVT(); + + bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN; + unsigned IntNoOffset = HasChain ? 1 : 0; + unsigned IntNo = Node->getConstantOperandVal(IntNoOffset); + + assert((IntNo == Intrinsic::riscv_vsetvli || + IntNo == Intrinsic::riscv_vsetvlimax || + IntNo == Intrinsic::riscv_vsetvli_opt || + IntNo == Intrinsic::riscv_vsetvlimax_opt) && + "Unexpected vsetvli intrinsic"); + + bool VLMax = IntNo == Intrinsic::riscv_vsetvlimax || + IntNo == Intrinsic::riscv_vsetvlimax_opt; + unsigned Offset = IntNoOffset + (VLMax ? 
1 : 2); + + assert(Node->getNumOperands() == Offset + 2 && + "Unexpected number of operands"); + + unsigned SEW = + RISCVVType::decodeVSEW(Node->getConstantOperandVal(Offset) & 0x7); + RISCVII::VLMUL VLMul = static_cast<RISCVII::VLMUL>( + Node->getConstantOperandVal(Offset + 1) & 0x7); + + unsigned VTypeI = RISCVVType::encodeVTYPE(VLMul, SEW, /*TailAgnostic*/ true, + /*MaskAgnostic*/ false); + SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT); + + SmallVector<EVT, 2> VTs = {XLenVT}; + if (HasChain) + VTs.push_back(MVT::Other); + + SDValue VLOperand; + unsigned Opcode = RISCV::PseudoVSETVLI; + if (VLMax) { + VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT); + Opcode = RISCV::PseudoVSETVLIX0; + } else { + VLOperand = Node->getOperand(IntNoOffset + 1); + + if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) { + uint64_t AVL = C->getZExtValue(); + if (isUInt<5>(AVL)) { + SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT); + SmallVector<SDValue, 3> Ops = {VLImm, VTypeIOp}; + if (HasChain) + Ops.push_back(Node->getOperand(0)); + ReplaceNode( + Node, CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, VTs, Ops)); + return; + } + } + } + + SmallVector<SDValue, 3> Ops = {VLOperand, VTypeIOp}; + if (HasChain) + Ops.push_back(Node->getOperand(0)); + + ReplaceNode(Node, CurDAG->getMachineNode(Opcode, DL, VTs, Ops)); +} void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we have already selected. @@ -498,7 +604,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node)) Imm = SignExtend64(Imm, 32); - ReplaceNode(Node, selectImm(CurDAG, DL, Imm, *Subtarget)); + ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget)); return; } case ISD::FrameIndex: { @@ -509,38 +615,69 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } case ISD::SRL: { - // We don't need this transform if zext.h is supported. - if (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()) + // Optimize (srl (and X, C2), C) -> + // (srli (slli X, (XLen-C3), (XLen-C3) + C) + // Where C2 is a mask with C3 trailing ones. + // Taking into account that the C2 may have had lower bits unset by + // SimplifyDemandedBits. This avoids materializing the C2 immediate. + // This pattern occurs when type legalizing right shifts for types with + // less than XLen bits. + auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); + if (!N1C) + break; + SDValue N0 = Node->getOperand(0); + if (N0.getOpcode() != ISD::AND || !N0.hasOneUse() || + !isa<ConstantSDNode>(N0.getOperand(1))) + break; + unsigned ShAmt = N1C->getZExtValue(); + uint64_t Mask = N0.getConstantOperandVal(1); + Mask |= maskTrailingOnes<uint64_t>(ShAmt); + if (!isMask_64(Mask)) + break; + unsigned TrailingOnes = countTrailingOnes(Mask); + // 32 trailing ones should use srliw via tablegen pattern. + if (TrailingOnes == 32 || ShAmt >= TrailingOnes) break; - // Optimize (srl (and X, 0xffff), C) -> - // (srli (slli X, (XLen-16), (XLen-16) + C) - // Taking into account that the 0xffff may have had lower bits unset by - // SimplifyDemandedBits. This avoids materializing the 0xffff immediate. - // This pattern occurs when type legalizing i16 right shifts. - // FIXME: This could be extended to other AND masks. 
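The generalized SRL case rests on an identity: if Mask consists of C3 trailing ones (after re-adding the low bits SimplifyDemandedBits may have cleared) and the shift amount C is below C3, then ((X & Mask) >> C) equals ((X << (64 - C3)) >> (64 - C3 + C)) in unsigned 64-bit arithmetic, so the mask constant never has to be materialized. A quick check over the whole (C3, C) space:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned XLen = 64;
  for (unsigned C3 = 1; C3 < 64; ++C3) {   // mask has C3 trailing ones
    uint64_t Mask = (1ULL << C3) - 1;
    for (unsigned C = 0; C < C3; ++C) {    // ShAmt < TrailingOnes, as checked
      for (uint64_t X : {0x0123456789ABCDEFULL, ~0ULL, 0x8000000000000001ULL}) {
        uint64_t AndThenSrl = (X & Mask) >> C;
        uint64_t SlliThenSrli = (X << (XLen - C3)) >> (XLen - C3 + C);
        assert(AndThenSrl == SlliThenSrli);
      }
    }
  }
}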
+ unsigned LShAmt = Subtarget->getXLen() - TrailingOnes; + SDNode *SLLI = + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getTargetConstant(LShAmt, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT)); + ReplaceNode(Node, SRLI); + return; + } + case ISD::SRA: { + // Optimize (sra (sext_inreg X, i16), C) -> + // (srai (slli X, (XLen-16), (XLen-16) + C) + // And (sra (sext_inreg X, i8), C) -> + // (srai (slli X, (XLen-8), (XLen-8) + C) + // This can occur when Zbb is enabled, which makes sext_inreg i16/i8 legal. + // This transform matches the code we get without Zbb. The shifts are more + // compressible, and this can help expose CSE opportunities in the sdiv by + // constant optimization. auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); - if (N1C) { - uint64_t ShAmt = N1C->getZExtValue(); - SDValue N0 = Node->getOperand(0); - if (ShAmt < 16 && N0.getOpcode() == ISD::AND && N0.hasOneUse() && - isa<ConstantSDNode>(N0.getOperand(1))) { - uint64_t Mask = N0.getConstantOperandVal(1); - Mask |= maskTrailingOnes<uint64_t>(ShAmt); - if (Mask == 0xffff) { - unsigned LShAmt = Subtarget->getXLen() - 16; - SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), - CurDAG->getTargetConstant(LShAmt, DL, VT)); - SDNode *SRLI = CurDAG->getMachineNode( - RISCV::SRLI, DL, VT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT)); - ReplaceNode(Node, SRLI); - return; - } - } - } - - break; + if (!N1C) + break; + SDValue N0 = Node->getOperand(0); + if (N0.getOpcode() != ISD::SIGN_EXTEND_INREG || !N0.hasOneUse()) + break; + unsigned ShAmt = N1C->getZExtValue(); + unsigned ExtSize = + cast<VTSDNode>(N0.getOperand(1))->getVT().getSizeInBits(); + // ExtSize of 32 should use sraiw via tablegen pattern. + if (ExtSize >= 32 || ShAmt >= ExtSize) + break; + unsigned LShAmt = Subtarget->getXLen() - ExtSize; + SDNode *SLLI = + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getTargetConstant(LShAmt, DL, VT)); + SDNode *SRAI = CurDAG->getMachineNode( + RISCV::SRAI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT)); + ReplaceNode(Node, SRAI); + return; } case ISD::AND: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); @@ -774,7 +911,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ShiftedC1 = SignExtend64(ShiftedC1, 32); // Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))). - SDNode *Imm = selectImm(CurDAG, DL, ShiftedC1, *Subtarget); + SDNode *Imm = selectImm(CurDAG, DL, VT, ShiftedC1, *Subtarget); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LeadingZeros, DL, VT)); @@ -793,62 +930,52 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { case Intrinsic::riscv_vmsge: { SDValue Src1 = Node->getOperand(1); SDValue Src2 = Node->getOperand(2); + bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu; + bool IsCmpUnsignedZero = false; // Only custom select scalar second operand. if (Src2.getValueType() != XLenVT) break; // Small constants are handled with patterns. 
if (auto *C = dyn_cast<ConstantSDNode>(Src2)) { int64_t CVal = C->getSExtValue(); - if (CVal >= -15 && CVal <= 16) - break; + if (CVal >= -15 && CVal <= 16) { + if (!IsUnsigned || CVal != 0) + break; + IsCmpUnsignedZero = true; + } } - bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu; MVT Src1VT = Src1.getSimpleValueType(); - unsigned VMSLTOpcode, VMNANDOpcode; + unsigned VMSLTOpcode, VMNANDOpcode, VMSetOpcode; switch (RISCVTargetLowering::getLMUL(Src1VT)) { default: llvm_unreachable("Unexpected LMUL!"); - case RISCVII::VLMUL::LMUL_F8: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8 : RISCV::PseudoVMSLT_VX_MF8; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF8; - break; - case RISCVII::VLMUL::LMUL_F4: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4 : RISCV::PseudoVMSLT_VX_MF4; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF4; - break; - case RISCVII::VLMUL::LMUL_F2: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2 : RISCV::PseudoVMSLT_VX_MF2; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF2; - break; - case RISCVII::VLMUL::LMUL_1: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1 : RISCV::PseudoVMSLT_VX_M1; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_M1; - break; - case RISCVII::VLMUL::LMUL_2: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2 : RISCV::PseudoVMSLT_VX_M2; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_M2; - break; - case RISCVII::VLMUL::LMUL_4: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4 : RISCV::PseudoVMSLT_VX_M4; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_M4; - break; - case RISCVII::VLMUL::LMUL_8: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8 : RISCV::PseudoVMSLT_VX_M8; - VMNANDOpcode = RISCV::PseudoVMNAND_MM_M8; - break; +#define CASE_VMSLT_VMNAND_VMSET_OPCODES(lmulenum, suffix, suffix_b) \ + case RISCVII::VLMUL::lmulenum: \ + VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \ + : RISCV::PseudoVMSLT_VX_##suffix; \ + VMNANDOpcode = RISCV::PseudoVMNAND_MM_##suffix; \ + VMSetOpcode = RISCV::PseudoVMSET_M_##suffix_b; \ + break; + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_F8, MF8, B1) + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_F4, MF4, B2) + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_F2, MF2, B4) + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_1, M1, B8) + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_2, M2, B16) + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_4, M4, B32) + CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_8, M8, B64) +#undef CASE_VMSLT_VMNAND_VMSET_OPCODES } SDValue SEW = CurDAG->getTargetConstant( Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT); SDValue VL; selectVLOp(Node->getOperand(3), VL); + // If vmsgeu with 0 immediate, expand it to vmset. + if (IsCmpUnsignedZero) { + ReplaceNode(Node, CurDAG->getMachineNode(VMSetOpcode, DL, VT, VL, SEW)); + return; + } + // Expand to // vmslt{u}.vx vd, va, x; vmnand.mm vd, vd, vd SDValue Cmp = SDValue( @@ -862,96 +989,61 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { case Intrinsic::riscv_vmsge_mask: { SDValue Src1 = Node->getOperand(2); SDValue Src2 = Node->getOperand(3); + bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu_mask; + bool IsCmpUnsignedZero = false; // Only custom select scalar second operand. if (Src2.getValueType() != XLenVT) break; // Small constants are handled with patterns. 
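The IsCmpUnsignedZero special case works because an unsigned comparison against zero is a tautology: every element satisfies x >= 0, so vmsgeu.vx with a zero scalar collapses to vmset.m (and, in the masked form handled next, to a vmand with the mask). The lanewise logic in scalar form:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Lanes[] = {0u, 1u, 0x80000000u, ~0u};
  for (uint32_t E : Lanes) {
    // The comparison is a tautology for unsigned values (compilers even warn
    // about it), which is why the whole result mask degenerates to all ones.
    bool GeZero = E >= 0u;
    assert(GeZero);
  }
}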
if (auto *C = dyn_cast<ConstantSDNode>(Src2)) { int64_t CVal = C->getSExtValue(); - if (CVal >= -15 && CVal <= 16) - break; + if (CVal >= -15 && CVal <= 16) { + if (!IsUnsigned || CVal != 0) + break; + IsCmpUnsignedZero = true; + } } - bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu_mask; MVT Src1VT = Src1.getSimpleValueType(); - unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode; + unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode, + VMSetOpcode, VMANDOpcode; switch (RISCVTargetLowering::getLMUL(Src1VT)) { default: llvm_unreachable("Unexpected LMUL!"); - case RISCVII::VLMUL::LMUL_F8: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8 : RISCV::PseudoVMSLT_VX_MF8; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8_MASK - : RISCV::PseudoVMSLT_VX_MF8_MASK; - break; - case RISCVII::VLMUL::LMUL_F4: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4 : RISCV::PseudoVMSLT_VX_MF4; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4_MASK - : RISCV::PseudoVMSLT_VX_MF4_MASK; - break; - case RISCVII::VLMUL::LMUL_F2: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2 : RISCV::PseudoVMSLT_VX_MF2; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2_MASK - : RISCV::PseudoVMSLT_VX_MF2_MASK; - break; - case RISCVII::VLMUL::LMUL_1: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1 : RISCV::PseudoVMSLT_VX_M1; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1_MASK - : RISCV::PseudoVMSLT_VX_M1_MASK; - break; - case RISCVII::VLMUL::LMUL_2: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2 : RISCV::PseudoVMSLT_VX_M2; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2_MASK - : RISCV::PseudoVMSLT_VX_M2_MASK; - break; - case RISCVII::VLMUL::LMUL_4: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4 : RISCV::PseudoVMSLT_VX_M4; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4_MASK - : RISCV::PseudoVMSLT_VX_M4_MASK; - break; - case RISCVII::VLMUL::LMUL_8: - VMSLTOpcode = - IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8 : RISCV::PseudoVMSLT_VX_M8; - VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8_MASK - : RISCV::PseudoVMSLT_VX_M8_MASK; - break; +#define CASE_VMSLT_VMSET_OPCODES(lmulenum, suffix, suffix_b) \ + case RISCVII::VLMUL::lmulenum: \ + VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \ + : RISCV::PseudoVMSLT_VX_##suffix; \ + VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix##_MASK \ + : RISCV::PseudoVMSLT_VX_##suffix##_MASK; \ + VMSetOpcode = RISCV::PseudoVMSET_M_##suffix_b; \ + break; + CASE_VMSLT_VMSET_OPCODES(LMUL_F8, MF8, B1) + CASE_VMSLT_VMSET_OPCODES(LMUL_F4, MF4, B2) + CASE_VMSLT_VMSET_OPCODES(LMUL_F2, MF2, B4) + CASE_VMSLT_VMSET_OPCODES(LMUL_1, M1, B8) + CASE_VMSLT_VMSET_OPCODES(LMUL_2, M2, B16) + CASE_VMSLT_VMSET_OPCODES(LMUL_4, M4, B32) + CASE_VMSLT_VMSET_OPCODES(LMUL_8, M8, B64) +#undef CASE_VMSLT_VMSET_OPCODES } // Mask operations use the LMUL from the mask type. 
switch (RISCVTargetLowering::getLMUL(VT)) { default: llvm_unreachable("Unexpected LMUL!"); - case RISCVII::VLMUL::LMUL_F8: - VMXOROpcode = RISCV::PseudoVMXOR_MM_MF8; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF8; - break; - case RISCVII::VLMUL::LMUL_F4: - VMXOROpcode = RISCV::PseudoVMXOR_MM_MF4; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF4; - break; - case RISCVII::VLMUL::LMUL_F2: - VMXOROpcode = RISCV::PseudoVMXOR_MM_MF2; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF2; - break; - case RISCVII::VLMUL::LMUL_1: - VMXOROpcode = RISCV::PseudoVMXOR_MM_M1; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_M1; - break; - case RISCVII::VLMUL::LMUL_2: - VMXOROpcode = RISCV::PseudoVMXOR_MM_M2; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_M2; - break; - case RISCVII::VLMUL::LMUL_4: - VMXOROpcode = RISCV::PseudoVMXOR_MM_M4; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_M4; - break; - case RISCVII::VLMUL::LMUL_8: - VMXOROpcode = RISCV::PseudoVMXOR_MM_M8; - VMANDNOpcode = RISCV::PseudoVMANDN_MM_M8; - break; +#define CASE_VMXOR_VMANDN_VMAND_OPCODES(lmulenum, suffix) \ + case RISCVII::VLMUL::lmulenum: \ + VMXOROpcode = RISCV::PseudoVMXOR_MM_##suffix; \ + VMANDNOpcode = RISCV::PseudoVMANDN_MM_##suffix; \ + VMANDOpcode = RISCV::PseudoVMAND_MM_##suffix; \ + break; + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F8, MF8) + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F4, MF4) + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F2, MF2) + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_1, M1) + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_2, M2) + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_4, M4) + CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_8, M8) +#undef CASE_VMXOR_VMANDN_VMAND_OPCODES } SDValue SEW = CurDAG->getTargetConstant( Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT); @@ -960,6 +1052,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { selectVLOp(Node->getOperand(5), VL); SDValue MaskedOff = Node->getOperand(1); SDValue Mask = Node->getOperand(4); + + // If vmsgeu_mask with 0 immediate, expand it to {vmset, vmand}. + if (IsCmpUnsignedZero) { + SDValue VMSet = + SDValue(CurDAG->getMachineNode(VMSetOpcode, DL, VT, VL, SEW), 0); + ReplaceNode(Node, CurDAG->getMachineNode(VMANDOpcode, DL, VT, + {Mask, VMSet, VL, MaskSEW})); + return; + } + // If the MaskedOff value and the Mask are the same value use // vmslt{u}.vx vt, va, x; vmandn.mm vd, vd, vt // This avoids needing to copy v0 to vd before starting the next sequence. @@ -988,6 +1090,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { {Cmp, Mask, VL, MaskSEW})); return; } + case Intrinsic::riscv_vsetvli_opt: + case Intrinsic::riscv_vsetvlimax_opt: + return selectVSETVLI(Node); } break; } @@ -997,54 +1102,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // By default we do not custom select any intrinsic. default: break; - case Intrinsic::riscv_vsetvli: - case Intrinsic::riscv_vsetvlimax: { - if (!Subtarget->hasVInstructions()) - break; - - bool VLMax = IntNo == Intrinsic::riscv_vsetvlimax; - unsigned Offset = VLMax ? 
2 : 3;
-
- assert(Node->getNumOperands() == Offset + 2 &&
- "Unexpected number of operands");
-
- unsigned SEW =
- RISCVVType::decodeVSEW(Node->getConstantOperandVal(Offset) & 0x7);
- RISCVII::VLMUL VLMul = static_cast<RISCVII::VLMUL>(
- Node->getConstantOperandVal(Offset + 1) & 0x7);
-
- unsigned VTypeI = RISCVVType::encodeVTYPE(
- VLMul, SEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
- SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
-
- SDValue VLOperand;
- unsigned Opcode = RISCV::PseudoVSETVLI;
- if (VLMax) {
- VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
- Opcode = RISCV::PseudoVSETVLIX0;
- } else {
- VLOperand = Node->getOperand(2);
-
- if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
- uint64_t AVL = C->getZExtValue();
- if (isUInt<5>(AVL)) {
- SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT);
- ReplaceNode(
- Node, CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, XLenVT,
- MVT::Other, VLImm, VTypeIOp,
- /* Chain */ Node->getOperand(0)));
- return;
- }
- }
- }
-
- ReplaceNode(Node,
- CurDAG->getMachineNode(Opcode, DL, XLenVT,
- MVT::Other, VLOperand, VTypeIOp,
- /* Chain */ Node->getOperand(0)));
- return;
- }
+ case Intrinsic::riscv_vsetvli:
+ case Intrinsic::riscv_vsetvlimax:
+ return selectVSETVLI(Node);
case Intrinsic::riscv_vlseg2:
case Intrinsic::riscv_vlseg3:
case Intrinsic::riscv_vlseg4:
@@ -1154,9 +1214,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
unsigned CurOp = 2;
+ // Masked intrinsics only have TU version pseudo instructions.
+ bool IsTU = IsMasked || (!IsMasked && !Node->getOperand(CurOp).isUndef());
SmallVector<SDValue, 8> Operands;
- if (IsMasked)
+ if (IsTU)
Operands.push_back(Node->getOperand(CurOp++));
+ else
+ // Skip the undef passthru operand for nomask TA version pseudo
+ CurOp++;
MVT IndexVT;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
@@ -1169,8 +1234,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) {
+ report_fatal_error("The V extension does not support EEW=64 for index "
+ "values when XLEN=32");
+ }
const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo(
- IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
+ IsMasked, IsTU, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
static_cast<unsigned>(IndexLMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1195,16 +1264,25 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
unsigned CurOp = 2;
+ // The riscv_vlm intrinsic is always tail agnostic and has no passthru operand.
+ bool HasPassthruOperand = IntNo != Intrinsic::riscv_vlm;
+ // Masked intrinsics only have TU version pseudo instructions.
+ bool IsTU =
+ HasPassthruOperand &&
+ ((!IsMasked && !Node->getOperand(CurOp).isUndef()) || IsMasked);
SmallVector<SDValue, 8> Operands;
- if (IsMasked)
+ if (IsTU)
Operands.push_back(Node->getOperand(CurOp++));
+ else if (HasPassthruOperand)
+ // Skip the undef passthru operand for nomask TA version pseudo
+ CurOp++;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
Operands, /*IsLoad=*/true);
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P =
- RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW,
+ RISCV::getVLEPseudo(IsMasked, IsTU, IsStrided, /*FF*/ false, Log2SEW,
static_cast<unsigned>(LMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1223,9 +1301,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
unsigned CurOp = 2;
+ // Masked intrinsics only have TU version pseudo instructions.
+ bool IsTU = IsMasked || (!IsMasked && !Node->getOperand(CurOp).isUndef());
SmallVector<SDValue, 7> Operands;
- if (IsMasked)
+ if (IsTU)
Operands.push_back(Node->getOperand(CurOp++));
+ else
+ // Skip the undef passthru operand for nomask TA version pseudo
+ CurOp++;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
/*IsStridedOrIndexed*/ false, Operands,
@@ -1233,8 +1316,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P =
- RISCV::getVLEPseudo(IsMasked, /*Strided*/ false, /*FF*/ true, Log2SEW,
- static_cast<unsigned>(LMUL));
+ RISCV::getVLEPseudo(IsMasked, IsTU, /*Strided*/ false, /*FF*/ true,
+ Log2SEW, static_cast<unsigned>(LMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), MVT::Other,
MVT::Glue, Operands);
@@ -1359,9 +1442,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) {
+ report_fatal_error("The V extension does not support EEW=64 for index "
+ "values when XLEN=32");
+ }
const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo(
- IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(IndexLMUL));
+ IsMasked, /*TU*/ false, IsOrdered, IndexLog2EEW,
+ static_cast<unsigned>(LMUL), static_cast<unsigned>(IndexLMUL));
MachineSDNode *Store =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1516,10 +1603,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
return;
}
case ISD::SPLAT_VECTOR:
+ case RISCVISD::VMV_S_X_VL:
+ case RISCVISD::VFMV_S_F_VL:
case RISCVISD::VMV_V_X_VL:
case RISCVISD::VFMV_V_F_VL: {
// Try to match splat of a scalar load to a strided load with stride of x0.
- SDValue Src = Node->getOperand(0);
+ bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL ||
+ Node->getOpcode() == RISCVISD::VFMV_S_F_VL;
+ if (IsScalarMove && !Node->getOperand(0).isUndef())
+ break;
+ SDValue Src = IsScalarMove ?
Node->getOperand(1) : Node->getOperand(0);
auto *Ld = dyn_cast<LoadSDNode>(Src);
if (!Ld)
break;
@@ -1534,7 +1627,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue VL;
if (Node->getOpcode() == ISD::SPLAT_VECTOR)
VL = CurDAG->getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT);
- else
+ else if (IsScalarMove) {
+ // We could deal with more VL if we update the VSETVLI insert pass to
+ // avoid introducing more VSETVLI.
+ if (!isOneConstant(Node->getOperand(2)))
+ break;
+ selectVLOp(Node->getOperand(2), VL);
+ } else
selectVLOp(Node->getOperand(1), VL);
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
@@ -1546,8 +1645,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
- /*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, Log2SEW,
- static_cast<unsigned>(LMUL));
+ /*IsMasked*/ false, /*IsTU*/ false, /*IsStrided*/ true, /*FF*/ false,
+ Log2SEW, static_cast<unsigned>(LMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1727,6 +1826,20 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
if (Bits < Subtarget->getXLen() - User->getConstantOperandVal(1))
return false;
break;
+ case RISCV::ANDI:
+ if (Bits < (64 - countLeadingZeros(User->getConstantOperandVal(1))))
+ return false;
+ break;
+ case RISCV::SEXTB:
+ if (Bits < 8)
+ return false;
+ break;
+ case RISCV::SEXTH:
+ case RISCV::ZEXTH_RV32:
+ case RISCV::ZEXTH_RV64:
+ if (Bits < 16)
+ return false;
+ break;
case RISCV::ADDUW:
case RISCV::SH1ADDUW:
case RISCV::SH2ADDUW:
@@ -1758,7 +1871,8 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
// allows us to choose between VSETIVLI or VSETVLI later.
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
auto *C = dyn_cast<ConstantSDNode>(N);
- if (C && isUInt<5>(C->getZExtValue()))
+ if (C && (isUInt<5>(C->getZExtValue()) ||
+ C->getSExtValue() == RISCV::VLMaxSentinel))
VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
N->getValueType(0));
else
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index a2770089995d..c429a9298739 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -87,6 +87,8 @@ public:
void selectVSSEG(SDNode *Node, bool IsMasked, bool IsStrided);
void selectVSXSEG(SDNode *Node, bool IsMasked, bool IsOrdered);
+ void selectVSETVLI(SDNode *Node);
+
// Return the RISC-V condition code that matches the given DAG integer
// condition code. The CondCode must be one of those supported by the RISC-V
// ISA (see translateSetCCForBranch).
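The ISD::SRA hunk above rests on a shift identity that is easy to check in isolation: sign-extending from i16 (or i8) and then shifting right arithmetically equals one slli that parks the narrow sign bit in bit XLen-1, followed by one srai that sign-extends and shifts in a single step. A minimal standalone C++ sketch of the XLen=64, i16 case (illustrative only, not part of the diff; it assumes the two's-complement conversion and arithmetic-shift behavior that C++20 guarantees and mainstream compilers have long provided):

#include <cassert>
#include <cstdint>

// (sra (sext_inreg X, i16), C) modeled directly.
int64_t sextInRegThenSra(int64_t X, unsigned C) {
  int64_t Ext = (int16_t)X; // sext_inreg X, i16
  return Ext >> C;          // sra
}

// The replacement sequence: (srai (slli X, 48), 48 + C).
int64_t slliThenSrai(int64_t X, unsigned C) {
  const unsigned LShAmt = 64 - 16; // XLen - ExtSize
  return (int64_t)((uint64_t)X << LShAmt) >> (LShAmt + C);
}

int main() {
  const int64_t Tests[] = {0, 1, -1, 0x7fff, -0x8000, 0x123456789abLL};
  for (int64_t X : Tests)
    for (unsigned C = 0; C < 16; ++C) // ShAmt < ExtSize, as the hunk requires
      assert(sextInRegThenSra(X, C) == slliThenSrai(X, C));
  return 0;
}

The ShAmt < ExtSize guard in the hunk is what keeps the combined srai amount LShAmt + ShAmt below XLen, so the single srai never needs an out-of-range shift.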
@@ -159,6 +161,7 @@ struct VSXSEGPseudo { struct VLEPseudo { uint16_t Masked : 1; + uint16_t IsTU : 1; uint16_t Strided : 1; uint16_t FF : 1; uint16_t Log2SEW : 3; @@ -176,6 +179,7 @@ struct VSEPseudo { struct VLX_VSXPseudo { uint16_t Masked : 1; + uint16_t IsTU : 1; uint16_t Ordered : 1; uint16_t Log2SEW : 3; uint16_t LMUL : 3; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4f5512e6fb37..5cc3aa35d4d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/ValueTypes.h" @@ -249,7 +250,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, XLenVT, Custom); setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); - if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) { + if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp() || + Subtarget.hasStdExtZbkb()) { if (Subtarget.is64Bit()) { setOperationAction(ISD::ROTL, MVT::i32, Custom); setOperationAction(ISD::ROTR, MVT::i32, Custom); @@ -277,7 +279,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll // pattern match it directly in isel. setOperationAction(ISD::BSWAP, XLenVT, - Subtarget.hasStdExtZbb() ? Legal : Expand); + (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb()) + ? Legal + : Expand); } if (Subtarget.hasStdExtZbb()) { @@ -330,6 +334,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f16, Legal); setOperationAction(ISD::LROUND, MVT::f16, Legal); setOperationAction(ISD::LLROUND, MVT::f16, Legal); + setOperationAction(ISD::STRICT_LRINT, MVT::f16, Legal); + setOperationAction(ISD::STRICT_LLRINT, MVT::f16, Legal); + setOperationAction(ISD::STRICT_LROUND, MVT::f16, Legal); + setOperationAction(ISD::STRICT_LLROUND, MVT::f16, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f16, Legal); setOperationAction(ISD::STRICT_FMA, MVT::f16, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f16, Legal); @@ -338,6 +346,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); @@ -363,6 +373,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); + // FIXME: Need to promote f16 STRICT_* to f32 libcalls, but we don't have + // complete support for all operations in LegalizeDAG. + // We need to custom promote this. 
if (Subtarget.is64Bit()) setOperationAction(ISD::FPOWI, MVT::i32, Custom); @@ -375,12 +388,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f32, Legal); setOperationAction(ISD::LROUND, MVT::f32, Legal); setOperationAction(ISD::LLROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_LRINT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_LROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); @@ -402,6 +421,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f64, Legal); setOperationAction(ISD::LROUND, MVT::f64, Legal); setOperationAction(ISD::LLROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_LRINT, MVT::f64, Legal); + setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Legal); + setOperationAction(ISD::STRICT_LROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); @@ -410,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); @@ -499,12 +524,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX, ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN, - ISD::VP_SELECT}; + ISD::VP_MERGE, ISD::VP_SELECT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SELECT}; + ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_MERGE, + ISD::VP_SELECT}; if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector @@ -546,6 +572,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::VP_MERGE, VT, Expand); + setOperationAction(ISD::VP_SELECT, VT, Expand); setOperationAction(ISD::VP_AND, VT, Custom); setOperationAction(ISD::VP_OR, VT, Custom); @@ -590,6 +618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Expand); 
setOperationAction(ISD::UMUL_LOHI, VT, Expand); + // nxvXi64 MULHS/MULHU requires the V extension instead of Zve64*. + if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) { + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + } + setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); @@ -886,8 +920,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMAX, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); - setOperationAction(ISD::MULHS, VT, Custom); - setOperationAction(ISD::MULHU, VT, Custom); + // vXi64 MULHS/MULHU requires the V extension instead of Zve64*. + if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) { + setOperationAction(ISD::MULHS, VT, Custom); + setOperationAction(ISD::MULHU, VT, Custom); + } setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); @@ -1002,9 +1039,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::i64, Custom); - setOperationAction(ISD::BITCAST, MVT::f16, Custom); - setOperationAction(ISD::BITCAST, MVT::f32, Custom); - setOperationAction(ISD::BITCAST, MVT::f64, Custom); + if (Subtarget.hasStdExtZfh()) + setOperationAction(ISD::BITCAST, MVT::f16, Custom); + if (Subtarget.hasStdExtF()) + setOperationAction(ISD::BITCAST, MVT::f32, Custom); + if (Subtarget.hasStdExtD()) + setOperationAction(ISD::BITCAST, MVT::f64, Custom); } } @@ -1024,7 +1064,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); + if (Subtarget.hasStdExtF()) { + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FP_TO_SINT_SAT); + setTargetDAGCombine(ISD::FP_TO_UINT_SAT); + } if (Subtarget.hasVInstructions()) { setTargetDAGCombine(ISD::FCOPYSIGN); setTargetDAGCombine(ISD::MGATHER); @@ -1072,7 +1118,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_masked_cmpxchg_i32: { PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = Align(4); @@ -1158,10 +1204,11 @@ bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // Zexts are free if they can be combined with a load. + // Don't advertise i32->i64 zextload as being free for RV64. It interacts + // poorly with type legalization of compares preferring sext. 
if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
- if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
- (Subtarget.is64Bit() && MemVT == MVT::i32)) &&
+ if ((MemVT == MVT::i8 || MemVT == MVT::i16) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
@@ -1189,7 +1236,9 @@ bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const {
if (VT.isVector())
return false;
- return Subtarget.hasStdExtZbb() && !isa<ConstantSDNode>(Y);
+ return (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp() ||
+ Subtarget.hasStdExtZbkb()) &&
+ !isa<ConstantSDNode>(Y);
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
@@ -1230,6 +1279,30 @@ bool RISCVTargetLowering::shouldSinkOperands(
switch (II->getIntrinsicID()) {
case Intrinsic::fma:
return Operand == 0 || Operand == 1;
+ // FIXME: Our patterns can only match vx/vf instructions when the splat
+ // is on the RHS, because TableGen doesn't recognize our VP operations
+ // as commutative.
+ case Intrinsic::vp_add:
+ case Intrinsic::vp_mul:
+ case Intrinsic::vp_and:
+ case Intrinsic::vp_or:
+ case Intrinsic::vp_xor:
+ case Intrinsic::vp_fadd:
+ case Intrinsic::vp_fmul:
+ case Intrinsic::vp_shl:
+ case Intrinsic::vp_lshr:
+ case Intrinsic::vp_ashr:
+ case Intrinsic::vp_udiv:
+ case Intrinsic::vp_sdiv:
+ case Intrinsic::vp_urem:
+ case Intrinsic::vp_srem:
+ return Operand == 1;
+ // ... with the exception of vp.sub/vp.fsub/vp.fdiv, which have
+ // explicit patterns for both LHS and RHS (as 'vr' versions).
+ case Intrinsic::vp_sub:
+ case Intrinsic::vp_fsub:
+ case Intrinsic::vp_fdiv:
+ return Operand == 0 || Operand == 1;
default:
return false;
}
@@ -1277,8 +1350,6 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return false;
if (VT == MVT::f64 && !Subtarget.hasStdExtD())
return false;
- if (Imm.isNegZero())
- return false;
return Imm.isZero();
}
@@ -1482,6 +1553,19 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const {
return false;
}
+static SDValue getVLOperand(SDValue Op) {
+ assert((Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN) &&
+ "Unexpected opcode");
+ bool HasChain = Op.getOpcode() == ISD::INTRINSIC_W_CHAIN;
+ unsigned IntNo = Op.getConstantOperandVal(HasChain ? 1 : 0);
+ const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
+ RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo);
+ if (!II)
+ return SDValue();
+ return Op.getOperand(II->VLOperand + 1 + HasChain);
+}
+
static bool useRVVForFixedLengthVectorVT(MVT VT,
const RISCVSubtarget &Subtarget) {
assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
@@ -1667,7 +1751,8 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
return false;
}
-static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) {
+static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
// RISCV FP-to-int conversions saturate to the destination register size, but
// don't produce 0 for nan. We can use a conversion instruction and fix the
// nan case with a compare and a select.
@@ -1679,15 +1764,17 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
unsigned Opc;
if (SatVT == DstVT)
- Opc = IsSigned ? RISCVISD::FCVT_X_RTZ : RISCVISD::FCVT_XU_RTZ;
+ Opc = IsSigned ? RISCVISD::FCVT_X : RISCVISD::FCVT_XU;
else if (DstVT == MVT::i64 && SatVT == MVT::i32)
- Opc = IsSigned ?
RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64;
+ Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
else
return SDValue();
// FIXME: Support other SatVTs by clamping before or after the conversion.
SDLoc DL(Op);
- SDValue FpToInt = DAG.getNode(Opc, DL, DstVT, Src);
+ SDValue FpToInt = DAG.getNode(
+ Opc, DL, DstVT, Src,
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT()));
SDValue ZeroInt = DAG.getConstant(0, DL, DstVT);
return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
@@ -1898,6 +1985,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// codegen across RV32 and RV64.
unsigned NumViaIntegerBits = std::min(std::max(NumElts, 8u), Subtarget.getXLen());
+ NumViaIntegerBits = std::min(NumViaIntegerBits,
+ Subtarget.getMaxELENForFixedLengthVectors());
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
// If we have to use more than one INSERT_VECTOR_ELT then this
// optimization is likely to increase code size; avoid performing it in
@@ -2190,6 +2279,17 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Lo,
// node in order to try and match RVV vector/scalar instructions.
if ((LoC >> 31) == HiC)
return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
+
+ // If vl is equal to VLMax and Hi constant is equal to Lo, we could use
+ // vmv.v.x whose EEW = 32 to lower it.
+ auto *Const = dyn_cast<ConstantSDNode>(VL);
+ if (LoC == HiC && Const && Const->getSExtValue() == RISCV::VLMaxSentinel) {
+ MVT InterVT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
+ // TODO: if vl <= min(VLMAX), we can also do this. But we could not
+ // access the subtarget here now.
+ auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT, Lo, VL);
+ return DAG.getNode(ISD::BITCAST, DL, VT, InterVec);
+ }
}
// Fall back to a stack store and stride x0 vector load.
@@ -2215,8 +2315,13 @@ static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- if (VT.isFloatingPoint())
+ if (VT.isFloatingPoint()) {
+ // If VL is 1, we could use vfmv.s.f.
+ if (isOneConstant(VL))
+ return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, DAG.getUNDEF(VT),
+ Scalar, VL);
return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL);
+ }
MVT XLenVT = Subtarget.getXLenVT();
@@ -2229,16 +2334,98 @@ static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
unsigned ExtOpc =
isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Scalar);
+ // If VL is 1 and the scalar value won't benefit from immediate, we could
+ // use vmv.s.x.
+ if (isOneConstant(VL) &&
+ (!Const || isNullConstant(Scalar) || !isInt<5>(Const->getSExtValue())))
+ return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT), Scalar,
+ VL);
return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL);
}
assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 &&
"Unexpected scalar for splat lowering!");
+ if (isOneConstant(VL) && isNullConstant(Scalar))
+ return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT),
+ DAG.getConstant(0, DL, XLenVT), VL);
+
// Otherwise use the more complicated splatting algorithm.
return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
}
+// Is the mask a slidedown that shifts in undefs.
+static int matchShuffleAsSlideDown(ArrayRef<int> Mask) {
+ int Size = Mask.size();
+
+ // Elements shifted in should be undef.
+ auto CheckUndefs = [&](int Shift) {
+ for (int i = Size - Shift; i != Size; ++i)
+ if (Mask[i] >= 0)
+ return false;
+ return true;
+ };
+
+ // Elements should be shifted or undef.
+ auto MatchShift = [&](int Shift) {
+ for (int i = 0; i != Size - Shift; ++i)
+ if (Mask[i] >= 0 && Mask[i] != Shift + i)
+ return false;
+ return true;
+ };
+
+ // Try all possible shifts.
+ for (int Shift = 1; Shift != Size; ++Shift)
+ if (CheckUndefs(Shift) && MatchShift(Shift))
+ return Shift;
+
+ // No match.
+ return -1;
+}
+
+static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
+ const RISCVSubtarget &Subtarget) {
+ // We need to be able to widen elements to the next larger integer type.
+ if (VT.getScalarSizeInBits() >= Subtarget.getMaxELENForFixedLengthVectors())
+ return false;
+
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ int Srcs[] = {-1, -1};
+ for (int i = 0; i != Size; ++i) {
+ // Ignore undef elements.
+ if (Mask[i] < 0)
+ continue;
+
+ // Is this an even or odd element.
+ int Pol = i % 2;
+
+ // Ensure we consistently use the same source for this element polarity.
+ int Src = Mask[i] / Size;
+ if (Srcs[Pol] < 0)
+ Srcs[Pol] = Src;
+ if (Srcs[Pol] != Src)
+ return false;
+
+ // Make sure the element within the source is appropriate for this element
+ // in the destination.
+ int Elt = Mask[i] % Size;
+ if (Elt != i / 2)
+ return false;
+ }
+
+ // We need to find a source for each polarity and they can't be the same.
+ if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
+ return false;
+
+ // Swap the sources if the second source was in the even polarity.
+ SwapSources = Srcs[0] > Srcs[1];
+
+ return true;
+}
+
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue V1 = Op.getOperand(0);
@@ -2284,8 +2471,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
SDValue IntID =
DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
- SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
- DAG.getRegister(RISCV::X0, XLenVT), VL};
+ SDValue Ops[] = {Ld->getChain(),
+ IntID,
+ DAG.getUNDEF(ContainerVT),
+ NewAddr,
+ DAG.getRegister(RISCV::X0, XLenVT),
+ VL};
SDValue NewLoad = DAG.getMemIntrinsicNode(
ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
DAG.getMachineFunction().getMachineMemOperand(
@@ -2324,10 +2515,97 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
}
+ ArrayRef<int> Mask = SVN->getMask();
+
+ // Try to match as a slidedown.
+ int SlideAmt = matchShuffleAsSlideDown(Mask);
+ if (SlideAmt >= 0) {
+ // TODO: Should we reduce the VL to account for the upper undef elements?
+ // Requires additional vsetvlis, but might be faster to execute.
+ V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+ SDValue SlideDown =
+ DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), V1,
+ DAG.getConstant(SlideAmt, DL, XLenVT),
+ TrueMask, VL);
+ return convertFromScalableVector(VT, SlideDown, DAG, Subtarget);
+ }
+
+ // Detect an interleave shuffle and lower to
+ // (vwmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
+ bool SwapSources;
+ if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) {
+ // Swap sources if needed.
+ if (SwapSources)
+ std::swap(V1, V2);
+
+ // Extract the lower half of the vectors.
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getConstant(0, DL, XLenVT));
+ V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2,
+ DAG.getConstant(0, DL, XLenVT));
+
+ // Double the element width and halve the number of elements in an int type.
+ unsigned EltBits = VT.getScalarSizeInBits();
+ MVT WideIntEltVT = MVT::getIntegerVT(EltBits * 2);
+ MVT WideIntVT =
+ MVT::getVectorVT(WideIntEltVT, VT.getVectorNumElements() / 2);
+ // Convert this to a scalable vector. We need to base this on the
+ // destination size to ensure there's always a type with a smaller LMUL.
+ MVT WideIntContainerVT =
+ getContainerForFixedLengthVector(DAG, WideIntVT, Subtarget);
+
+ // Convert sources to scalable vectors with the same element count as the
+ // larger type.
+ MVT HalfContainerVT = MVT::getVectorVT(
+ VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount());
+ V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget);
+ V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget);
+
+ // Cast sources to integer.
+ MVT IntEltVT = MVT::getIntegerVT(EltBits);
+ MVT IntHalfVT =
+ MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount());
+ V1 = DAG.getBitcast(IntHalfVT, V1);
+ V2 = DAG.getBitcast(IntHalfVT, V2);
+
+ // Freeze V2 since we use it twice and we need to be sure that the add and
+ // multiply see the same value.
+ V2 = DAG.getNode(ISD::FREEZE, DL, IntHalfVT, V2);
+
+ // Recreate TrueMask using the widened type's element count.
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, HalfContainerVT.getVectorElementCount());
+ TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+
+ // Widen V1 and V2 with 0s and add one copy of V2 to V1.
+ SDValue Add = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1,
+ V2, TrueMask, VL);
+ // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer.
+ SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
+ DAG.getAllOnesConstant(DL, XLenVT));
+ SDValue WidenMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT,
+ V2, Multiplier, TrueMask, VL);
+ // Add the new copies to our previous addition giving us 2^eltbits copies of
+ // V2. This is equivalent to shifting V2 left by eltbits. This should
+ // combine with the vwmulu.vv above to form vwmaccu.vv.
+ Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul,
+ TrueMask, VL);
+ // Cast back to ContainerVT. We need to re-create a new ContainerVT in case
+ // WideIntContainerVT is a larger fractional LMUL than implied by the fixed
+ // vector VT.
+ ContainerVT =
+ MVT::getVectorVT(VT.getVectorElementType(),
+ WideIntContainerVT.getVectorElementCount() * 2);
+ Add = DAG.getBitcast(ContainerVT, Add);
+ return convertFromScalableVector(VT, Add, DAG, Subtarget);
+ }
+
// Detect shuffles which can be re-expressed as vector selects; these are
// shuffles in which each element in the destination is taken from an element
// at the corresponding index in either source vector.
- bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) {
+ bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) {
int MaskIndex = MaskIdx.value();
return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
});
@@ -2353,7 +2631,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// Now construct the mask that will be used by the vselect or blended
// vrgather operation. For vrgathers, construct the appropriate indices into
// each vector.
- for (int MaskIndex : SVN->getMask()) {
+ for (int MaskIndex : Mask) {
bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
if (!IsSelect) {
@@ -2691,15 +2969,25 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
MVT VT = Op.getSimpleValueType();
assert(VT == Subtarget.getXLenVT() && "Unexpected custom legalization");
SDLoc DL(Op);
- if (Op.getOperand(2).getOpcode() == ISD::Constant)
- return Op;
// FSL/FSR take a log2(XLen)+1 bit shift amount but XLenVT FSHL/FSHR only
- // use log(XLen) bits. Mask the shift amount accordingly.
+ // use log(XLen) bits. Mask the shift amount accordingly to prevent
+ // accidentally setting the extra bit.
unsigned ShAmtWidth = Subtarget.getXLen() - 1;
SDValue ShAmt = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(2),
DAG.getConstant(ShAmtWidth, DL, VT));
- unsigned Opc = Op.getOpcode() == ISD::FSHL ? RISCVISD::FSL : RISCVISD::FSR;
- return DAG.getNode(Opc, DL, VT, Op.getOperand(0), Op.getOperand(1), ShAmt);
+ // fshl and fshr concatenate their operands in the same order. fsr and fsl
+ // instructions use different orders. fshl will return its first operand for
+ // shift of zero, fshr will return its second operand. fsl and fsr both
+ // return rs1 so the ISD nodes need to have different operand orders.
+ // Shift amount is in rs2.
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ unsigned Opc = RISCVISD::FSL;
+ if (Op.getOpcode() == ISD::FSHR) {
+ std::swap(Op0, Op1);
+ Opc = RISCVISD::FSR;
+ }
+ return DAG.getNode(Opc, DL, VT, Op0, Op1, ShAmt);
}
case ISD::TRUNCATE: {
SDLoc DL(Op);
@@ -2774,7 +3062,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// We define our scalable vector types for lmul=1 to use a 64 bit known
// minimum size. e.g. <vscale x 2 x i32>. VLENB is in bytes so we calculate
// vscale as VLENB / 8.
- assert(RISCV::RVVBitsPerBlock == 64 && "Unexpected bits per block!");
+ static_assert(RISCV::RVVBitsPerBlock == 64, "Unexpected bits per block!");
if (isa<ConstantSDNode>(Op.getOperand(0))) {
// We assume VLENB is a multiple of 8. We manually choose the best shift
// here because SimplifyDemandedBits isn't always able to simplify it.
@@ -3001,7 +3289,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
}
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
- return lowerFP_TO_INT_SAT(Op, DAG);
+ return lowerFP_TO_INT_SAT(Op, DAG, Subtarget);
case ISD::FTRUNC:
case ISD::FCEIL:
case ISD::FFLOOR:
@@ -3063,9 +3351,14 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
unsigned NumOpElts =
Op.getOperand(0).getSimpleValueType().getVectorMinNumElements();
SDValue Vec = DAG.getUNDEF(VT);
- for (const auto &OpIdx : enumerate(Op->ops()))
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, OpIdx.value(),
+ for (const auto &OpIdx : enumerate(Op->ops())) {
+ SDValue SubVec = OpIdx.value();
+ // Don't insert undef subvectors.
+ if (SubVec.isUndef()) + continue; + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, SubVec, DAG.getIntPtrConstant(OpIdx.index() * NumOpElts, DL)); + } return Vec; } case ISD::LOAD: @@ -3181,6 +3474,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSET_ROUNDING(Op, DAG); case ISD::VP_SELECT: return lowerVPOp(Op, DAG, RISCVISD::VSELECT_VL); + case ISD::VP_MERGE: + return lowerVPOp(Op, DAG, RISCVISD::VP_MERGE_VL); case ISD::VP_ADD: return lowerVPOp(Op, DAG, RISCVISD::ADD_VL); case ISD::VP_SUB: @@ -4044,10 +4339,10 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo); - if (!II || !II->SplatOperand) + if (!II || !II->hasSplatOperand()) return SDValue(); - unsigned SplatOp = II->SplatOperand + HasChain; + unsigned SplatOp = II->SplatOperand + 1 + HasChain; assert(SplatOp < Op.getNumOperands()); SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end()); @@ -4077,7 +4372,7 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, // that a widening operation never uses SEW=64. // NOTE: If this fails the below assert, we can probably just find the // element count from any operand or result and use it to construct the VT. - assert(II->SplatOperand > 1 && "Unexpected splat operand!"); + assert(II->SplatOperand > 0 && "Unexpected splat operand!"); MVT VT = Op.getOperand(SplatOp - 1).getSimpleValueType(); // The more complex case is when the scalar is larger than XLenVT. @@ -4096,8 +4391,7 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, // We need to convert the scalar to a splat vector. // FIXME: Can we implicitly truncate the scalar if it is known to // be sign extended? - // VL should be the last operand. - SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue VL = getVLOperand(Op); assert(VL.getValueType() == XLenVT); ScalarOp = splatSplitI64WithVL(DL, VT, ScalarOp, VL, DAG); return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands); @@ -4138,6 +4432,15 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, : RISCVISD::BDECOMPRESS; return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::riscv_bfp: + return DAG.getNode(RISCVISD::BFP, DL, XLenVT, Op.getOperand(1), + Op.getOperand(2)); + case Intrinsic::riscv_fsl: + return DAG.getNode(RISCVISD::FSL, DL, XLenVT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::riscv_fsr: + return DAG.getNode(RISCVISD::FSR, DL, XLenVT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::riscv_vmv_x_s: assert(Op.getValueType() == XLenVT && "Unexpected VT!"); return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(), @@ -4176,7 +4479,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // vmerge.vvm vDest, vSrc, vVal, mMask MVT VT = Op.getSimpleValueType(); SDValue Vec = Op.getOperand(1); - SDValue VL = Op.getOperand(3); + SDValue VL = getVLOperand(Op); SDValue SplattedVal = splatSplitI64WithVL(DL, VT, Scalar, VL, DAG); SDValue SplattedIdx = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, @@ -4222,7 +4525,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(1, DL, XLenVT)); // Double the VL since we halved SEW. 
- SDValue VL = Op.getOperand(NumOps - (1 + OpOffset)); + SDValue VL = getVLOperand(Op); SDValue I32VL = DAG.getNode(ISD::SHL, DL, XLenVT, VL, DAG.getConstant(1, DL, XLenVT)); @@ -4294,7 +4597,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, auto *Load = cast<MemIntrinsicSDNode>(Op); SmallVector<SDValue, 8> Ops{Load->getChain(), IntID}; - if (!IsUnmasked) + if (IsUnmasked) + Ops.push_back(DAG.getUNDEF(ContainerVT)); + else Ops.push_back(PassThru); Ops.push_back(Op.getOperand(3)); // Ptr Ops.push_back(Op.getOperand(4)); // Stride @@ -4720,7 +5025,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, // register size. Therefore we must slide the vector group up the full // amount. if (SubVecVT.isFixedLengthVector()) { - if (OrigIdx == 0 && Vec.isUndef()) + if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector()) return Op; MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { @@ -4730,6 +5035,10 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SubVec, DAG.getConstant(0, DL, XLenVT)); + if (OrigIdx == 0 && Vec.isUndef() && VecVT.isFixedLengthVector()) { + SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget); + return DAG.getBitcast(Op.getValueType(), SubVec); + } SDValue Mask = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first; // Set the vector length to only the number of elements we care about. Note @@ -5148,7 +5457,9 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, unsigned IntID = IsUnmasked ? Intrinsic::riscv_vle : Intrinsic::riscv_vle_mask; SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)}; - if (!IsUnmasked) + if (IsUnmasked) + Ops.push_back(DAG.getUNDEF(ContainerVT)); + else Ops.push_back(PassThru); Ops.push_back(BasePtr); if (!IsUnmasked) @@ -5518,13 +5829,20 @@ SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op, } } + if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) { + IndexVT = IndexVT.changeVectorElementType(XLenVT); + Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index); + } + if (!VL) VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; unsigned IntID = IsUnmasked ? 
Intrinsic::riscv_vluxei : Intrinsic::riscv_vluxei_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
- if (!IsUnmasked)
+ if (IsUnmasked)
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ else
Ops.push_back(PassThru);
Ops.push_back(BasePtr);
Ops.push_back(Index);
@@ -5619,6 +5937,11 @@ SDValue RISCVTargetLowering::lowerMaskedScatter(SDValue Op,
}
}
+ if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
+ }
+
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
@@ -5697,6 +6020,39 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op,
RMValue);
}
+static RISCVISD::NodeType getRISCVWOpcodeByIntr(unsigned IntNo) {
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected Intrinsic");
+ case Intrinsic::riscv_grev:
+ return RISCVISD::GREVW;
+ case Intrinsic::riscv_gorc:
+ return RISCVISD::GORCW;
+ case Intrinsic::riscv_bcompress:
+ return RISCVISD::BCOMPRESSW;
+ case Intrinsic::riscv_bdecompress:
+ return RISCVISD::BDECOMPRESSW;
+ case Intrinsic::riscv_bfp:
+ return RISCVISD::BFPW;
+ case Intrinsic::riscv_fsl:
+ return RISCVISD::FSLW;
+ case Intrinsic::riscv_fsr:
+ return RISCVISD::FSRW;
+ }
+}
+
+// Converts the given intrinsic to an i64 operation with any extension.
+static SDValue customLegalizeToWOpByIntr(SDNode *N, SelectionDAG &DAG,
+ unsigned IntNo) {
+ SDLoc DL(N);
+ RISCVISD::NodeType WOpcode = getRISCVWOpcodeByIntr(IntNo);
+ SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp1, NewOp2);
+ // ReplaceNodeResults requires we maintain the same type for the return value.
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewRes);
+}
+
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
@@ -5776,17 +6132,20 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
if (!isTypeLegal(Op0.getValueType()))
return;
if (IsStrict) {
- unsigned Opc = IsSigned ? RISCVISD::STRICT_FCVT_W_RTZ_RV64
- : RISCVISD::STRICT_FCVT_WU_RTZ_RV64;
+ unsigned Opc = IsSigned ? RISCVISD::STRICT_FCVT_W_RV64
+ : RISCVISD::STRICT_FCVT_WU_RV64;
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
- SDValue Res = DAG.getNode(Opc, DL, VTs, N->getOperand(0), Op0);
+ SDValue Res = DAG.getNode(
+ Opc, DL, VTs, N->getOperand(0), Op0,
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, MVT::i64));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
Results.push_back(Res.getValue(1));
return;
}
- unsigned Opc =
- IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64;
- SDValue Res = DAG.getNode(Opc, DL, MVT::i64, Op0);
+ unsigned Opc = IsSigned ?
RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
+ SDValue Res =
+ DAG.getNode(Opc, DL, MVT::i64, Op0,
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, MVT::i64));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
}
@@ -6078,15 +6437,23 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
- SDValue NewOp2 =
+ SDValue NewShAmt =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
// FSLW/FSRW take a 6 bit shift amount but i32 FSHL/FSHR only use 5 bits.
- // Mask the shift amount to 5 bits.
- NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2,
- DAG.getConstant(0x1f, DL, MVT::i64));
- unsigned Opc =
- N->getOpcode() == ISD::FSHL ? RISCVISD::FSLW : RISCVISD::FSRW;
- SDValue NewOp = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1, NewOp2);
+ // Mask the shift amount to 5 bits to prevent accidentally setting bit 5.
+ NewShAmt = DAG.getNode(ISD::AND, DL, MVT::i64, NewShAmt,
+ DAG.getConstant(0x1f, DL, MVT::i64));
+ // fshl and fshr concatenate their operands in the same order. fsrw and fslw
+ // instructions use different orders. fshl will return its first operand for
+ // shift of zero, fshr will return its second operand. fsl and fsr both
+ // return rs1 so the ISD nodes need to have different operand orders.
+ // Shift amount is in rs2.
+ unsigned Opc = RISCVISD::FSLW;
+ if (N->getOpcode() == ISD::FSHR) {
+ std::swap(NewOp0, NewOp1);
+ Opc = RISCVISD::FSRW;
+ }
+ SDValue NewOp = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1, NewShAmt);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewOp));
break;
}
@@ -6154,6 +6521,31 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
default:
llvm_unreachable(
"Don't know how to custom type legalize this intrinsic!");
+ case Intrinsic::riscv_grev:
+ case Intrinsic::riscv_gorc:
+ case Intrinsic::riscv_bcompress:
+ case Intrinsic::riscv_bdecompress:
+ case Intrinsic::riscv_bfp: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ Results.push_back(customLegalizeToWOpByIntr(N, DAG, IntNo));
+ break;
+ }
+ case Intrinsic::riscv_fsl:
+ case Intrinsic::riscv_fsr: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ SDValue NewOp3 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(3));
+ unsigned Opc = getRISCVWOpcodeByIntr(IntNo);
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2, NewOp3);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ break;
+ }
case Intrinsic::riscv_orc_b: {
// Lower to the GORCI encoding for orc.b with the operand extended.
SDValue NewOp =
@@ -6166,20 +6558,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
}
- case Intrinsic::riscv_grev:
- case Intrinsic::riscv_gorc: {
- assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
- "Unexpected custom legalisation");
- SDValue NewOp1 =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
- SDValue NewOp2 =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
- unsigned Opc =
- IntNo == Intrinsic::riscv_grev ?
RISCVISD::GREVW : RISCVISD::GORCW; - SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2); - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); - break; - } case Intrinsic::riscv_shfl: case Intrinsic::riscv_unshfl: { assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && @@ -6200,21 +6578,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); break; } - case Intrinsic::riscv_bcompress: - case Intrinsic::riscv_bdecompress: { - assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && - "Unexpected custom legalisation"); - SDValue NewOp1 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); - SDValue NewOp2 = - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); - unsigned Opc = IntNo == Intrinsic::riscv_bcompress - ? RISCVISD::BCOMPRESSW - : RISCVISD::BDECOMPRESSW; - SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2); - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); - break; - } case Intrinsic::riscv_vmv_x_s: { EVT VT = N->getValueType(0); MVT XLenVT = Subtarget.getXLenVT(); @@ -6923,9 +7286,14 @@ static SDValue performANY_EXTENDCombine(SDNode *N, // Try to form VWMUL or VWMULU. // FIXME: Support VWMULSU. -static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1, - SelectionDAG &DAG) { +static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, + bool Commute) { assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode"); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Commute) + std::swap(Op0, Op1); + bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL; bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL; if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse()) @@ -7002,6 +7370,123 @@ static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1, return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL); } +static RISCVFPRndMode::RoundingMode matchRoundingOp(SDValue Op) { + switch (Op.getOpcode()) { + case ISD::FROUNDEVEN: return RISCVFPRndMode::RNE; + case ISD::FTRUNC: return RISCVFPRndMode::RTZ; + case ISD::FFLOOR: return RISCVFPRndMode::RDN; + case ISD::FCEIL: return RISCVFPRndMode::RUP; + case ISD::FROUND: return RISCVFPRndMode::RMM; + } + + return RISCVFPRndMode::Invalid; +} + +// Fold +// (fp_to_int (froundeven X)) -> fcvt X, rne +// (fp_to_int (ftrunc X)) -> fcvt X, rtz +// (fp_to_int (ffloor X)) -> fcvt X, rdn +// (fp_to_int (fceil X)) -> fcvt X, rup +// (fp_to_int (fround X)) -> fcvt X, rmm +static SDValue performFP_TO_INTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT XLenVT = Subtarget.getXLenVT(); + + // Only handle XLen or i32 types. Other types narrower than XLen will + // eventually be legalized to XLenVT. + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != XLenVT) + return SDValue(); + + SDValue Src = N->getOperand(0); + + // Ensure the FP type is also legal. + if (!TLI.isTypeLegal(Src.getValueType())) + return SDValue(); + + // Don't do this for f16 with Zfhmin and not Zfh. + if (Src.getValueType() == MVT::f16 && !Subtarget.hasStdExtZfh()) + return SDValue(); + + RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src); + if (FRM == RISCVFPRndMode::Invalid) + return SDValue(); + + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + unsigned Opc; + if (VT == XLenVT) + Opc = IsSigned ? 
RISCVISD::FCVT_X : RISCVISD::FCVT_XU; + else + Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64; + + SDLoc DL(N); + SDValue FpToInt = DAG.getNode(Opc, DL, XLenVT, Src.getOperand(0), + DAG.getTargetConstant(FRM, DL, XLenVT)); + return DAG.getNode(ISD::TRUNCATE, DL, VT, FpToInt); +} + +// Fold +// (fp_to_int_sat (froundeven X)) -> (select X == nan, 0, (fcvt X, rne)) +// (fp_to_int_sat (ftrunc X)) -> (select X == nan, 0, (fcvt X, rtz)) +// (fp_to_int_sat (ffloor X)) -> (select X == nan, 0, (fcvt X, rdn)) +// (fp_to_int_sat (fceil X)) -> (select X == nan, 0, (fcvt X, rup)) +// (fp_to_int_sat (fround X)) -> (select X == nan, 0, (fcvt X, rmm)) +static SDValue performFP_TO_INT_SATCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT XLenVT = Subtarget.getXLenVT(); + + // Only handle XLen types. Other types narrower than XLen will eventually be + // legalized to XLenVT. + EVT DstVT = N->getValueType(0); + if (DstVT != XLenVT) + return SDValue(); + + SDValue Src = N->getOperand(0); + + // Ensure the FP type is also legal. + if (!TLI.isTypeLegal(Src.getValueType())) + return SDValue(); + + // Don't do this for f16 with Zfhmin and not Zfh. + if (Src.getValueType() == MVT::f16 && !Subtarget.hasStdExtZfh()) + return SDValue(); + + EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT(); + + RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src); + if (FRM == RISCVFPRndMode::Invalid) + return SDValue(); + + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT; + + unsigned Opc; + if (SatVT == DstVT) + Opc = IsSigned ? RISCVISD::FCVT_X : RISCVISD::FCVT_XU; + else if (DstVT == MVT::i64 && SatVT == MVT::i32) + Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64; + else + return SDValue(); + // FIXME: Support other SatVTs by clamping before or after the conversion. + + Src = Src.getOperand(0); + + SDLoc DL(N); + SDValue FpToInt = DAG.getNode(Opc, DL, XLenVT, Src, + DAG.getTargetConstant(FRM, DL, XLenVT)); + + // RISCV FP-to-int conversions saturate to the destination register size, but + // don't produce 0 for nan. + SDValue ZeroInt = DAG.getConstant(0, DL, DstVT); + return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); +} + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -7083,25 +7568,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(N, 0); break; } - case RISCVISD::FSL: - case RISCVISD::FSR: { - // Only the lower log2(Bitwidth)+1 bits of the the shift amount are read. - unsigned BitWidth = N->getOperand(2).getValueSizeInBits(); - assert(isPowerOf2_32(BitWidth) && "Unexpected bit width"); - if (SimplifyDemandedLowBitsHelper(2, Log2_32(BitWidth) + 1)) - return SDValue(N, 0); - break; - } - case RISCVISD::FSLW: - case RISCVISD::FSRW: { - // Only the lower 32 bits of Values and lower 6 bits of shift amount are - // read. - if (SimplifyDemandedLowBitsHelper(0, 32) || - SimplifyDemandedLowBitsHelper(1, 32) || - SimplifyDemandedLowBitsHelper(2, 6)) - return SDValue(N, 0); - break; - } case RISCVISD::GREV: case RISCVISD::GORC: { // Only the lower log2(Bitwidth) bits of the the shift amount are read. 
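The performFP_TO_INT_SATCombine fold above must preserve the saturating semantics spelled out in the comment at its end: the RISC-V fcvt instructions already saturate out-of-range inputs to the destination register size, but they convert NaN to the maximal value, whereas ISD::FP_TO_SINT_SAT defines NaN as 0, hence the select keyed on an unordered self-compare (SETUO). A minimal scalar sketch of those semantics in plain C++, covering only the i64 destination with RTZ rounding (the function name is invented for illustration):

#include <cmath>
#include <cstdint>
#include <limits>

// Scalar model of (fp_to_int_sat (ftrunc X)) after the fold: the first
// branch is the SETUO select arm, the clamps stand in for fcvt's
// built-in saturation, and the cast truncates like frm = RTZ.
int64_t fpToSint64SatRtz(double X) {
  if (std::isnan(X)) // (select (setuo X, X), 0, (fcvt X, rtz))
    return 0;
  if (X <= -9223372036854775808.0) // -2^63 and below map to INT64_MIN
    return std::numeric_limits<int64_t>::min();
  if (X >= 9223372036854775808.0) // 2^63 and above saturate to INT64_MAX
    return std::numeric_limits<int64_t>::max();
  return static_cast<int64_t>(X); // in range: plain truncating convert
}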
@@ -7331,6 +7797,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFP_TO_INTCombine(N, DCI, Subtarget); + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: + return performFP_TO_INT_SATCombine(N, DCI, Subtarget); case ISD::FCOPYSIGN: { EVT VT = N->getValueType(0); if (!VT.isVector()) @@ -7464,15 +7936,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } - case RISCVISD::MUL_VL: { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - if (SDValue V = combineMUL_VLToVWMUL(N, Op0, Op1, DAG)) + case RISCVISD::MUL_VL: + if (SDValue V = combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ false)) return V; - if (SDValue V = combineMUL_VLToVWMUL(N, Op1, Op0, DAG)) - return V; - return SDValue(); - } + // Mul is commutative. + return combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ true); case ISD::STORE: { auto *Store = cast<StoreSDNode>(N); SDValue Val = Store->getValue(); @@ -7486,12 +7954,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (VecVT.getVectorElementType() == MemVT) { SDLoc DL(N); MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount()); - return DAG.getStoreVP(Store->getChain(), DL, Src, Store->getBasePtr(), - DAG.getConstant(1, DL, MaskVT), - DAG.getConstant(1, DL, Subtarget.getXLenVT()), - Store->getPointerInfo(), - Store->getOriginalAlign(), - Store->getMemOperand()->getFlags()); + return DAG.getStoreVP( + Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(), + DAG.getConstant(1, DL, MaskVT), + DAG.getConstant(1, DL, Subtarget.getXLenVT()), MemVT, + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore(), /*IsCompress*/ false); } } @@ -7732,14 +8200,18 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // We assume VLENB is no more than 65536 / 8 bytes. Known.Zero.setBitsFrom(14); break; - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = Op.getConstantOperandVal(1); + case ISD::INTRINSIC_W_CHAIN: + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = + Op.getConstantOperandVal(Opc == ISD::INTRINSIC_WO_CHAIN ? 0 : 1); switch (IntNo) { default: // We can't do anything for most intrinsics. break; case Intrinsic::riscv_vsetvli: case Intrinsic::riscv_vsetvlimax: + case Intrinsic::riscv_vsetvli_opt: + case Intrinsic::riscv_vsetvlimax_opt: // Assume that VL output is positive and would fit in an int32_t. // TODO: VLEN might be capped at 16 bits in a future V spec update. if (BitWidth >= 32) @@ -7779,10 +8251,11 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( case RISCVISD::UNSHFLW: case RISCVISD::BCOMPRESSW: case RISCVISD::BDECOMPRESSW: - case RISCVISD::FCVT_W_RTZ_RV64: - case RISCVISD::FCVT_WU_RTZ_RV64: - case RISCVISD::STRICT_FCVT_W_RTZ_RV64: - case RISCVISD::STRICT_FCVT_WU_RTZ_RV64: + case RISCVISD::BFPW: + case RISCVISD::FCVT_W_RV64: + case RISCVISD::FCVT_WU_RV64: + case RISCVISD::STRICT_FCVT_W_RV64: + case RISCVISD::STRICT_FCVT_WU_RV64: // TODO: As the result is sign-extended, this is conservatively correct. A // more precise answer could be calculated for SRAW depending on known // bits in the shift amount. 
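In the combine dispatch above, the MUL_VL case now calls combineMUL_VLToVWMUL_VL twice, once per operand order, instead of threading both operand orders through explicitly. The shape is the generic retry pattern for commutative folds; a self-contained C++ sketch, with names that are illustrative rather than LLVM API:

#include <optional>
#include <utility>

// Run a matcher that expects its pattern (here, a sign/zero extend) on
// the first operand, then retry with the operands swapped, because
// multiplication is commutative. The second call mirrors Commute == true.
template <typename Node, typename Fold>
std::optional<Node> tryCommutativeFold(Node Op0, Node Op1, Fold MatchFold) {
  if (std::optional<Node> R = MatchFold(Op0, Op1))
    return R;
  std::swap(Op0, Op1);
  return MatchFold(Op0, Op1);
}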
@@ -7958,6 +8431,42 @@ static bool isSelectPseudo(MachineInstr &MI) { } } +static MachineBasicBlock *emitQuietFCMP(MachineInstr &MI, MachineBasicBlock *BB, + unsigned RelOpcode, unsigned EqOpcode, + const RISCVSubtarget &Subtarget) { + DebugLoc DL = MI.getDebugLoc(); + Register DstReg = MI.getOperand(0).getReg(); + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + Register SavedFFlags = MRI.createVirtualRegister(&RISCV::GPRRegClass); + const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); + + // Save the current FFLAGS. + BuildMI(*BB, MI, DL, TII.get(RISCV::ReadFFLAGS), SavedFFlags); + + auto MIB = BuildMI(*BB, MI, DL, TII.get(RelOpcode), DstReg) + .addReg(Src1Reg) + .addReg(Src2Reg); + if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) + MIB->setFlag(MachineInstr::MIFlag::NoFPExcept); + + // Restore the FFLAGS. + BuildMI(*BB, MI, DL, TII.get(RISCV::WriteFFLAGS)) + .addReg(SavedFFlags, RegState::Kill); + + // Issue a dummy FEQ opcode to raise exception for signaling NaNs. + auto MIB2 = BuildMI(*BB, MI, DL, TII.get(EqOpcode), RISCV::X0) + .addReg(Src1Reg, getKillRegState(MI.getOperand(1).isKill())) + .addReg(Src2Reg, getKillRegState(MI.getOperand(2).isKill())); + if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) + MIB2->setFlag(MachineInstr::MIFlag::NoFPExcept); + + // Erase the pseudoinstruction. + MI.eraseFromParent(); + return BB; +} + static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, MachineBasicBlock *BB, const RISCVSubtarget &Subtarget) { @@ -8099,6 +8608,18 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitBuildPairF64Pseudo(MI, BB); case RISCV::SplitF64Pseudo: return emitSplitF64Pseudo(MI, BB); + case RISCV::PseudoQuietFLE_H: + return emitQuietFCMP(MI, BB, RISCV::FLE_H, RISCV::FEQ_H, Subtarget); + case RISCV::PseudoQuietFLT_H: + return emitQuietFCMP(MI, BB, RISCV::FLT_H, RISCV::FEQ_H, Subtarget); + case RISCV::PseudoQuietFLE_S: + return emitQuietFCMP(MI, BB, RISCV::FLE_S, RISCV::FEQ_S, Subtarget); + case RISCV::PseudoQuietFLT_S: + return emitQuietFCMP(MI, BB, RISCV::FLT_S, RISCV::FEQ_S, Subtarget); + case RISCV::PseudoQuietFLE_D: + return emitQuietFCMP(MI, BB, RISCV::FLE_D, RISCV::FEQ_D, Subtarget); + case RISCV::PseudoQuietFLT_D: + return emitQuietFCMP(MI, BB, RISCV::FLT_D, RISCV::FEQ_D, Subtarget); } } @@ -8393,7 +8914,8 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, LocVT = XLenVT; LocInfo = CCValAssign::Indirect; } else if (ValVT.isScalableVector()) { - report_fatal_error("Unable to pass scalable vector types on the stack"); + LocVT = XLenVT; + LocInfo = CCValAssign::Indirect; } else { // Pass fixed-length vectors on the stack. LocVT = ValVT; @@ -8592,8 +9114,14 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, EVT LocVT = VA.getLocVT(); EVT ValVT = VA.getValVT(); EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0)); + if (ValVT.isScalableVector()) { + // When the value is a scalable vector, we save the pointer which points to + // the scalable vector value in the stack. The ValVT will be the pointer + // type, instead of the scalable vector type. 
+ ValVT = LocVT; + } int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(), - /*Immutable=*/true); + /*IsImmutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val; @@ -8623,7 +9151,8 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, if (VA.isMemLoc()) { // f64 is passed on the stack. - int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true); + int FI = + MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*IsImmutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); return DAG.getLoad(MVT::f64, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); @@ -8637,7 +9166,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, SDValue Hi; if (VA.getLocReg() == RISCV::X17) { // Second half of f64 is passed on the stack. - int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true); + int FI = MFI.CreateFixedObject(4, 0, /*IsImmutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); @@ -9510,12 +10039,12 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMV_X_ANYEXTH) NODE_NAME_CASE(FMV_W_X_RV64) NODE_NAME_CASE(FMV_X_ANYEXTW_RV64) - NODE_NAME_CASE(FCVT_X_RTZ) - NODE_NAME_CASE(FCVT_XU_RTZ) - NODE_NAME_CASE(FCVT_W_RTZ_RV64) - NODE_NAME_CASE(FCVT_WU_RTZ_RV64) - NODE_NAME_CASE(STRICT_FCVT_W_RTZ_RV64) - NODE_NAME_CASE(STRICT_FCVT_WU_RTZ_RV64) + NODE_NAME_CASE(FCVT_X) + NODE_NAME_CASE(FCVT_XU) + NODE_NAME_CASE(FCVT_W_RV64) + NODE_NAME_CASE(FCVT_WU_RV64) + NODE_NAME_CASE(STRICT_FCVT_W_RV64) + NODE_NAME_CASE(STRICT_FCVT_WU_RV64) NODE_NAME_CASE(READ_CYCLE_WIDE) NODE_NAME_CASE(GREV) NODE_NAME_CASE(GREVW) @@ -9525,6 +10054,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SHFLW) NODE_NAME_CASE(UNSHFL) NODE_NAME_CASE(UNSHFLW) + NODE_NAME_CASE(BFP) + NODE_NAME_CASE(BFPW) NODE_NAME_CASE(BCOMPRESS) NODE_NAME_CASE(BCOMPRESSW) NODE_NAME_CASE(BDECOMPRESS) @@ -9598,8 +10129,10 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FP_ROUND_VL) NODE_NAME_CASE(VWMUL_VL) NODE_NAME_CASE(VWMULU_VL) + NODE_NAME_CASE(VWADDU_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) + NODE_NAME_CASE(VP_MERGE_VL) NODE_NAME_CASE(VMAND_VL) NODE_NAME_CASE(VMOR_VL) NODE_NAME_CASE(VMXOR_VL) @@ -9768,12 +10301,18 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Default(RISCV::NoRegister); if (FReg != RISCV::NoRegister) { assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg"); - if (Subtarget.hasStdExtD()) { + if (Subtarget.hasStdExtD() && (VT == MVT::f64 || VT == MVT::Other)) { unsigned RegNo = FReg - RISCV::F0_F; unsigned DReg = RISCV::F0_D + RegNo; return std::make_pair(DReg, &RISCV::FPR64RegClass); } - return std::make_pair(FReg, &RISCV::FPR32RegClass); + if (VT == MVT::f32 || VT == MVT::Other) + return std::make_pair(FReg, &RISCV::FPR32RegClass); + if (Subtarget.hasStdExtZfh() && VT == MVT::f16) { + unsigned RegNo = FReg - RISCV::F0_F; + unsigned HReg = RISCV::F0_H + RegNo; + return std::make_pair(HReg, &RISCV::FPR16RegClass); + } } } @@ -10070,6 +10609,24 @@ bool RISCVTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, } } +unsigned RISCVTargetLowering::getJumpTableEncoding() const { + // If we are using the small code model, we can reduce size of jump table + // entry to 4 bytes. 
+ if (Subtarget.is64Bit() && !isPositionIndependent() && + getTargetMachine().getCodeModel() == CodeModel::Small) { + return MachineJumpTableInfo::EK_Custom32; + } + return TargetLowering::getJumpTableEncoding(); +} + +const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry( + const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, + unsigned uid, MCContext &Ctx) const { + assert(Subtarget.is64Bit() && !isPositionIndependent() && + getTargetMachine().getCodeModel() == CodeModel::Small); + return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx); +} + bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); @@ -10293,6 +10850,60 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( return SDValue(); } +SDValue +RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N, 0); // Lower SDIV as SDIV + + assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) && + "Unexpected divisor!"); + + // Conditional move is needed, so do the transformation iff Zbt is enabled. + if (!Subtarget.hasStdExtZbt()) + return SDValue(); + + // When |Divisor| >= 2 ^ 12, it isn't profitable to do such transformation. + // Besides, more critical path instructions will be generated when dividing + // by 2. So we keep using the original DAGs for these cases. + unsigned Lg2 = Divisor.countTrailingZeros(); + if (Lg2 == 1 || Lg2 >= 12) + return SDValue(); + + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && !(Subtarget.is64Bit() && VT == MVT::i64)) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); + + // Add (N0 < 0) ? Pow2 - 1 : 0; + SDValue Cmp = DAG.getSetCC(DL, VT, N0, Zero, ISD::SETLT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue Sel = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0); + + Created.push_back(Cmp.getNode()); + Created.push_back(Add.getNode()); + Created.push_back(Sel.getNode()); + + // Divide by pow2. + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, Sel, DAG.getConstant(Lg2, DL, VT)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (Divisor.isNonNegative()) + return SRA; + + Created.push_back(SRA.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); +} + #define GET_REGISTER_MATCHER #include "RISCVGenAsmMatcher.inc" diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 48c5ce730933..58b7ec89f875 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -63,11 +63,11 @@ enum NodeType : unsigned { CLZW, CTZW, // RV64IB/RV32IB funnel shifts, with the semantics of the named RISC-V - // instructions, but the same operand order as fshl/fshr intrinsics. + // instructions. Operand order is rs1, rs3, rs2/shamt. FSR, FSL, - // RV64IB funnel shifts, with the semantics of the named RISC-V instructions, - // but the same operand order as fshl/fshr intrinsics. + // RV64IB funnel shifts, with the semantics of the named RISC-V instructions. + // Operand order is rs1, rs3, rs2/shamt. 
FSRW, FSLW, // FPR<->GPR transfer operations when the FPR is smaller than XLEN, needed as @@ -86,14 +86,16 @@ enum NodeType : unsigned { FMV_X_ANYEXTW_RV64, // FP to XLen int conversions. Corresponds to fcvt.l(u).s/d/h on RV64 and // fcvt.w(u).s/d/h on RV32. Unlike FP_TO_S/UINT these saturate out of - // range inputs. These are used for FP_TO_S/UINT_SAT lowering. - FCVT_X_RTZ, - FCVT_XU_RTZ, + // range inputs. These are used for FP_TO_S/UINT_SAT lowering. Rounding mode + // is passed as a TargetConstant operand using the RISCVFPRndMode enum. + FCVT_X, + FCVT_XU, // FP to 32 bit int conversions for RV64. These are used to keep track of the // result being sign extended to 64 bit. These saturate out of range inputs. - // Used for FP_TO_S/UINT and FP_TO_S/UINT_SAT lowering. - FCVT_W_RTZ_RV64, - FCVT_WU_RTZ_RV64, + // Used for FP_TO_S/UINT and FP_TO_S/UINT_SAT lowering. Rounding mode + // is passed as a TargetConstant operand using the RISCVFPRndMode enum. + FCVT_W_RV64, + FCVT_WU_RV64, // READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target // (returns (Lo, Hi)). It takes a chain operand. READ_CYCLE_WIDE, @@ -118,6 +120,13 @@ enum NodeType : unsigned { BCOMPRESSW, BDECOMPRESS, BDECOMPRESSW, + // The bit field place (bfp) instruction places up to XLEN/2 LSB bits from rs2 + // into the value in rs1. The upper bits of rs2 control the length of the bit + // field and target position. The layout of rs2 is chosen in a way that makes + // it possible to construct rs2 easily using pack[h] instructions and/or + // andi/lui. + BFP, + BFPW, // Vector Extension // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand // for the VL value to be used for the operation. @@ -236,6 +245,7 @@ enum NodeType : unsigned { // Widening instructions VWMUL_VL, VWMULU_VL, + VWADDU_VL, // Vector compare producing a mask. Fourth operand is input mask. Fifth // operand is VL. @@ -243,6 +253,10 @@ enum NodeType : unsigned { // Vector select with an additional VL operand. This operation is unmasked. VSELECT_VL, + // Vector select with operand #2 (the value when the condition is false) tied + // to the destination and an additional VL operand. This operation is + // unmasked. + VP_MERGE_VL, // Mask binary operators. VMAND_VL, @@ -284,8 +298,8 @@ enum NodeType : unsigned { // FP to 32 bit int conversions for RV64. These are used to keep track of the // result being sign extended to 64 bit. These saturate out of range inputs. - STRICT_FCVT_W_RTZ_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, - STRICT_FCVT_WU_RTZ_RV64, + STRICT_FCVT_W_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCVT_WU_RV64, // Memory opcodes start here. 
VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -462,6 +476,8 @@ public: SelectionDAG &DAG) const override; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + template <class NodeTy> + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override { @@ -524,6 +540,16 @@ public: bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const override; + + unsigned getJumpTableEncoding() const override; + + const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, + unsigned uid, + MCContext &Ctx) const override; + private: /// RISCVCCAssignFn - This target-specific function extends the default /// CCValAssign with additional information used to lower RISC-V calling @@ -544,9 +570,6 @@ private: bool IsRet, CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const; - template <class NodeTy> - SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; - SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, bool UseGOT) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; @@ -652,6 +675,15 @@ namespace RISCVVIntrinsicsTable { struct RISCVVIntrinsicInfo { unsigned IntrinsicID; uint8_t SplatOperand; + uint8_t VLOperand; + bool hasSplatOperand() const { + // 0xF is not valid. See NoSplatOperand in IntrinsicsRISCV.td. + return SplatOperand != 0xF; + } + bool hasVLOperand() const { + // 0x1F is not valid. See NoVLOperand in IntrinsicsRISCV.td. + return VLOperand != 0x1F; + } }; using namespace RISCV; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index dbfc90f36f80..d39e0805a79c 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -59,12 +59,13 @@ class VSETVLIInfo { uint8_t MaskAgnostic : 1; uint8_t MaskRegOp : 1; uint8_t StoreOp : 1; + uint8_t ScalarMovOp : 1; uint8_t SEWLMULRatioOnly : 1; public: VSETVLIInfo() : AVLImm(0), TailAgnostic(false), MaskAgnostic(false), MaskRegOp(false), - StoreOp(false), SEWLMULRatioOnly(false) {} + StoreOp(false), ScalarMovOp(false), SEWLMULRatioOnly(false) {} static VSETVLIInfo getUnknown() { VSETVLIInfo Info; @@ -96,6 +97,18 @@ public: assert(hasAVLImm()); return AVLImm; } + bool hasZeroAVL() const { + if (hasAVLImm()) + return getAVLImm() == 0; + return false; + } + bool hasNonZeroAVL() const { + if (hasAVLImm()) + return getAVLImm() > 0; + if (hasAVLReg()) + return getAVLReg() == RISCV::X0; + return false; + } bool hasSameAVL(const VSETVLIInfo &Other) const { assert(isValid() && Other.isValid() && @@ -120,7 +133,7 @@ public: MaskAgnostic = RISCVVType::isMaskAgnostic(VType); } void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA, bool MRO, - bool IsStore) { + bool IsStore, bool IsScalarMovOp) { assert(isValid() && !isUnknown() && "Can't set VTYPE for uninitialized or unknown"); VLMul = L; @@ -129,6 +142,7 @@ public: MaskAgnostic = MA; MaskRegOp = MRO; StoreOp = IsStore; + ScalarMovOp = IsScalarMovOp; } unsigned encodeVTYPE() const { @@ -139,6 +153,16 @@ public: bool hasSEWLMULRatioOnly() const { return SEWLMULRatioOnly; } + bool hasSameSEW(const VSETVLIInfo &Other) const { + assert(isValid() && Other.isValid() && + "Can't compare invalid VSETVLIInfos"); + 
assert(!isUnknown() && !Other.isUnknown() && + "Can't compare VTYPE in unknown state"); + assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly && + "Can't compare when only LMUL/SEW ratio is valid."); + return SEW == Other.SEW; + } + bool hasSameVTYPE(const VSETVLIInfo &Other) const { assert(isValid() && Other.isValid() && "Can't compare invalid VSETVLIInfos"); @@ -178,6 +202,15 @@ public: return getSEWLMULRatio() == Other.getSEWLMULRatio(); } + bool hasSamePolicy(const VSETVLIInfo &Other) const { + assert(isValid() && Other.isValid() && + "Can't compare invalid VSETVLIInfos"); + assert(!isUnknown() && !Other.isUnknown() && + "Can't compare VTYPE in unknown state"); + return TailAgnostic == Other.TailAgnostic && + MaskAgnostic == Other.MaskAgnostic; + } + bool hasCompatibleVTYPE(const VSETVLIInfo &InstrInfo, bool Strict) const { // Simple case, see if full VTYPE matches. if (hasSameVTYPE(InstrInfo)) @@ -222,6 +255,15 @@ public: return true; } + // For vmv.s.x and vfmv.s.f, there are only two behaviors, VL = 0 and VL > 0. + // So it's compatible when we can make sure that both VLs are in the same + // situation. + if (!Strict && InstrInfo.ScalarMovOp && InstrInfo.hasAVLImm() && + ((hasNonZeroAVL() && InstrInfo.hasNonZeroAVL()) || + (hasZeroAVL() && InstrInfo.hasZeroAVL())) && + hasSameSEW(InstrInfo) && hasSamePolicy(InstrInfo)) + return true; + // The AVL must match. if (!hasSameAVL(InstrInfo)) return false; @@ -414,6 +456,36 @@ static MachineInstr *elideCopies(MachineInstr *MI, } } +static bool isScalarMoveInstr(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case RISCV::PseudoVMV_S_X_M1: + case RISCV::PseudoVMV_S_X_M2: + case RISCV::PseudoVMV_S_X_M4: + case RISCV::PseudoVMV_S_X_M8: + case RISCV::PseudoVMV_S_X_MF2: + case RISCV::PseudoVMV_S_X_MF4: + case RISCV::PseudoVMV_S_X_MF8: + case RISCV::PseudoVFMV_S_F16_M1: + case RISCV::PseudoVFMV_S_F16_M2: + case RISCV::PseudoVFMV_S_F16_M4: + case RISCV::PseudoVFMV_S_F16_M8: + case RISCV::PseudoVFMV_S_F16_MF2: + case RISCV::PseudoVFMV_S_F16_MF4: + case RISCV::PseudoVFMV_S_F32_M1: + case RISCV::PseudoVFMV_S_F32_M2: + case RISCV::PseudoVFMV_S_F32_M4: + case RISCV::PseudoVFMV_S_F32_M8: + case RISCV::PseudoVFMV_S_F32_MF2: + case RISCV::PseudoVFMV_S_F64_M1: + case RISCV::PseudoVFMV_S_F64_M2: + case RISCV::PseudoVFMV_S_F64_M4: + case RISCV::PseudoVFMV_S_F64_M8: + return true; + } +} + static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags, const MachineRegisterInfo *MRI) { VSETVLIInfo InstrInfo; @@ -461,6 +533,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags, // If there are no explicit defs, this is a store instruction which can // ignore the tail and mask policies.
bool StoreOp = MI.getNumExplicitDefs() == 0; + bool ScalarMovOp = isScalarMoveInstr(MI); if (RISCVII::hasVLOp(TSFlags)) { const MachineOperand &VLOp = MI.getOperand(NumOperands - 2); @@ -477,7 +550,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags, } else InstrInfo.setAVLReg(RISCV::NoRegister); InstrInfo.setVTYPE(VLMul, SEW, /*TailAgnostic*/ TailAgnostic, - /*MaskAgnostic*/ false, MaskRegOp, StoreOp); + /*MaskAgnostic*/ false, MaskRegOp, StoreOp, ScalarMovOp); return InstrInfo; } @@ -1000,6 +1073,13 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE()); NeedInsertVSETVLI = false; } + if (isScalarMoveInstr(MI) && + ((CurInfo.hasNonZeroAVL() && NewInfo.hasNonZeroAVL()) || + (CurInfo.hasZeroAVL() && NewInfo.hasZeroAVL())) && + NewInfo.hasSameVLMAX(CurInfo)) { + PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE()); + NeedInsertVSETVLI = false; + } } if (NeedInsertVSETVLI) insertVSETVLI(MBB, MI, NewInfo, CurInfo); diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index 6a16b6354f95..f99d0f56c406 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -206,6 +206,13 @@ class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string let isCodeGenOnly = 1; } +class PseudoQuietFCMP<RegisterClass Ty> + : Pseudo<(outs GPR:$rd), (ins Ty:$rs1, Ty:$rs2), []> { + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; +} + // Pseudo load instructions. class PseudoLoad<string opcodestr, RegisterClass rdty = GPR> : Pseudo<(outs rdty:$rd), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr"> { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 2e2e00886d57..7baed2793e4e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -201,8 +201,9 @@ static bool isConvertibleToVMV_V_V(const RISCVSubtarget &STI, if (MBBI->modifiesRegister(RISCV::VL)) return false; - // Go through all defined operands, including implicit defines. - for (const MachineOperand &MO : MBBI->operands()) { + // Only convert whole register copies to vmv.v.v when the defining + // value appears in the explicit operands. + for (const MachineOperand &MO : MBBI->explicit_operands()) { if (!MO.isReg() || !MO.isDef()) continue; if (!FoundDef && TRI->isSubRegisterEq(MO.getReg(), SrcReg)) { @@ -914,7 +915,7 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, .addMBB(&DestBB, RISCVII::MO_CALL); RS->enterBasicBlockEnd(MBB); - unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass, + Register Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass, MI.getIterator(), false, 0); // TODO: The case when there is no scavenged register needs special handling.
assert(Scav != RISCV::NoRegister && "No register is scavenged!"); @@ -1145,6 +1146,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, else Ok = isUInt<5>(Imm); break; + case RISCVOp::OPERAND_RVKRNUM: + Ok = Imm >= 0 && Imm <= 10; + break; } if (!Ok) { ErrInfo = "Invalid immediate"; @@ -1399,19 +1403,28 @@ MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall( #define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL) \ RISCV::PseudoV##OP##_##TYPE##_##LMUL -#define CASE_VFMA_OPCODE_LMULS(OP, TYPE) \ - CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF8): \ - case CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF4): \ - case CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF2): \ - case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M1): \ +#define CASE_VFMA_OPCODE_LMULS_M1(OP, TYPE) \ + CASE_VFMA_OPCODE_COMMON(OP, TYPE, M1): \ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M2): \ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M4): \ case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M8) +#define CASE_VFMA_OPCODE_LMULS_MF2(OP, TYPE) \ + CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF2): \ + case CASE_VFMA_OPCODE_LMULS_M1(OP, TYPE) + +#define CASE_VFMA_OPCODE_LMULS_MF4(OP, TYPE) \ + CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF4): \ + case CASE_VFMA_OPCODE_LMULS_MF2(OP, TYPE) + +#define CASE_VFMA_OPCODE_LMULS(OP, TYPE) \ + CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF8): \ + case CASE_VFMA_OPCODE_LMULS_MF4(OP, TYPE) + #define CASE_VFMA_SPLATS(OP) \ - CASE_VFMA_OPCODE_LMULS(OP, VF16): \ - case CASE_VFMA_OPCODE_LMULS(OP, VF32): \ - case CASE_VFMA_OPCODE_LMULS(OP, VF64) + CASE_VFMA_OPCODE_LMULS_MF4(OP, VF16): \ + case CASE_VFMA_OPCODE_LMULS_MF2(OP, VF32): \ + case CASE_VFMA_OPCODE_LMULS_M1(OP, VF64) // clang-format on bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, @@ -1430,10 +1443,10 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, case CASE_VFMA_SPLATS(FNMSUB): case CASE_VFMA_SPLATS(FNMACC): case CASE_VFMA_SPLATS(FNMSAC): - case CASE_VFMA_OPCODE_LMULS(FMACC, VV): - case CASE_VFMA_OPCODE_LMULS(FMSAC, VV): - case CASE_VFMA_OPCODE_LMULS(FNMACC, VV): - case CASE_VFMA_OPCODE_LMULS(FNMSAC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMACC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMSAC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMACC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMSAC, VV): case CASE_VFMA_OPCODE_LMULS(MADD, VX): case CASE_VFMA_OPCODE_LMULS(NMSUB, VX): case CASE_VFMA_OPCODE_LMULS(MACC, VX): @@ -1454,10 +1467,10 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; return true; } - case CASE_VFMA_OPCODE_LMULS(FMADD, VV): - case CASE_VFMA_OPCODE_LMULS(FMSUB, VV): - case CASE_VFMA_OPCODE_LMULS(FNMADD, VV): - case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMADD, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMSUB, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMADD, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMSUB, VV): case CASE_VFMA_OPCODE_LMULS(MADD, VV): case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): { // If the tail policy is undisturbed we can't commute. 
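The CASE_VFMA macro regrouping above feeds findCommutedOpIndices and commuteInstructionImpl, which must swap the opcode whenever commuting moves the addend: vfmadd.vv multiplies the accumulator and adds vs2, while vfmacc.vv multiplies vs1 by vs2 and adds the accumulator. A scalar C++ sketch of the identity that makes the opcode switch value-preserving (the model function names are ours, not LLVM's):

#include <cassert>
#include <cmath>

// vfmadd.vv semantics: vd = (vd * vs1) + vs2
double vfmaddModel(double Vd, double Vs1, double Vs2) {
  return std::fma(Vd, Vs1, Vs2);
}

// vfmacc.vv semantics: vd = (vs1 * vs2) + vd
double vfmaccModel(double Vd, double Vs1, double Vs2) {
  return std::fma(Vs1, Vs2, Vd);
}

int main() {
  double A = 1.5, B = -2.25, C = 4.0;
  // Moving the addend between operand slots preserves the value only if
  // the opcode moves with it, which is what the opcode tables encode.
  assert(vfmaddModel(A, B, C) == vfmaccModel(C, A, B));
  return 0;
}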
@@ -1533,19 +1546,28 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL; \ break; -#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE) \ - CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8) \ - CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4) \ - CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2) \ +#define CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE) \ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M1) \ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M2) \ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M4) \ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M8) +#define CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE) \ + CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE) + +#define CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE) \ + CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE) + +#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE) \ + CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE) + #define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP) \ - CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF16) \ - CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF32) \ - CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF64) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VF16) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VF32) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VF64) MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -1566,10 +1588,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, case CASE_VFMA_SPLATS(FNMADD): case CASE_VFMA_SPLATS(FNMSAC): case CASE_VFMA_SPLATS(FNMSUB): - case CASE_VFMA_OPCODE_LMULS(FMACC, VV): - case CASE_VFMA_OPCODE_LMULS(FMSAC, VV): - case CASE_VFMA_OPCODE_LMULS(FNMACC, VV): - case CASE_VFMA_OPCODE_LMULS(FNMSAC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMACC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMSAC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMACC, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMSAC, VV): case CASE_VFMA_OPCODE_LMULS(MADD, VX): case CASE_VFMA_OPCODE_LMULS(NMSUB, VX): case CASE_VFMA_OPCODE_LMULS(MACC, VX): @@ -1592,10 +1614,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMADD, FNMACC) CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMSAC, FNMSUB) CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMSUB, FNMSAC) - CASE_VFMA_CHANGE_OPCODE_LMULS(FMACC, FMADD, VV) - CASE_VFMA_CHANGE_OPCODE_LMULS(FMSAC, FMSUB, VV) - CASE_VFMA_CHANGE_OPCODE_LMULS(FNMACC, FNMADD, VV) - CASE_VFMA_CHANGE_OPCODE_LMULS(FNMSAC, FNMSUB, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMACC, FMADD, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMSAC, FMSUB, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMACC, FNMADD, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMSAC, FNMSUB, VV) CASE_VFMA_CHANGE_OPCODE_LMULS(MACC, MADD, VX) CASE_VFMA_CHANGE_OPCODE_LMULS(MADD, MACC, VX) CASE_VFMA_CHANGE_OPCODE_LMULS(NMSAC, NMSUB, VX) @@ -1609,10 +1631,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } - case CASE_VFMA_OPCODE_LMULS(FMADD, VV): - case CASE_VFMA_OPCODE_LMULS(FMSUB, VV): - case CASE_VFMA_OPCODE_LMULS(FNMADD, VV): - case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMADD, 
VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FMSUB, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMADD, VV): + case CASE_VFMA_OPCODE_LMULS_MF4(FNMSUB, VV): case CASE_VFMA_OPCODE_LMULS(MADD, VV): case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): { assert((OpIdx1 == 1 || OpIdx2 == 1) && "Unexpected opcode index"); @@ -1623,10 +1645,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); - CASE_VFMA_CHANGE_OPCODE_LMULS(FMADD, FMACC, VV) - CASE_VFMA_CHANGE_OPCODE_LMULS(FMSUB, FMSAC, VV) - CASE_VFMA_CHANGE_OPCODE_LMULS(FNMADD, FNMACC, VV) - CASE_VFMA_CHANGE_OPCODE_LMULS(FNMSUB, FNMSAC, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMADD, FMACC, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMSUB, FMSAC, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMADD, FNMACC, VV) + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMSUB, FNMSAC, VV) CASE_VFMA_CHANGE_OPCODE_LMULS(MADD, MACC, VV) CASE_VFMA_CHANGE_OPCODE_LMULS(NMSUB, NMSAC, VV) } @@ -1655,13 +1677,16 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, #define CASE_WIDEOP_OPCODE_COMMON(OP, LMUL) \ RISCV::PseudoV##OP##_##LMUL##_TIED -#define CASE_WIDEOP_OPCODE_LMULS(OP) \ - CASE_WIDEOP_OPCODE_COMMON(OP, MF8): \ - case CASE_WIDEOP_OPCODE_COMMON(OP, MF4): \ +#define CASE_WIDEOP_OPCODE_LMULS_MF4(OP) \ + CASE_WIDEOP_OPCODE_COMMON(OP, MF4): \ case CASE_WIDEOP_OPCODE_COMMON(OP, MF2): \ case CASE_WIDEOP_OPCODE_COMMON(OP, M1): \ case CASE_WIDEOP_OPCODE_COMMON(OP, M2): \ case CASE_WIDEOP_OPCODE_COMMON(OP, M4) + +#define CASE_WIDEOP_OPCODE_LMULS(OP) \ + CASE_WIDEOP_OPCODE_COMMON(OP, MF8): \ + case CASE_WIDEOP_OPCODE_LMULS_MF4(OP) // clang-format on #define CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, LMUL) \ @@ -1669,22 +1694,25 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, NewOpc = RISCV::PseudoV##OP##_##LMUL; \ break; -#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP) \ - CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8) \ +#define CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP) \ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4) \ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2) \ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1) \ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2) \ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4) +#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP) \ + CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8) \ + CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP) + MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { switch (MI.getOpcode()) { default: break; - case CASE_WIDEOP_OPCODE_LMULS(FWADD_WV): - case CASE_WIDEOP_OPCODE_LMULS(FWSUB_WV): + case CASE_WIDEOP_OPCODE_LMULS_MF4(FWADD_WV): + case CASE_WIDEOP_OPCODE_LMULS_MF4(FWSUB_WV): case CASE_WIDEOP_OPCODE_LMULS(WADD_WV): case CASE_WIDEOP_OPCODE_LMULS(WADDU_WV): case CASE_WIDEOP_OPCODE_LMULS(WSUB_WV): @@ -1694,14 +1722,14 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); - CASE_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV) - CASE_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV) + CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWADD_WV) + CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWSUB_WV) CASE_WIDEOP_CHANGE_OPCODE_LMULS(WADD_WV) CASE_WIDEOP_CHANGE_OPCODE_LMULS(WADDU_WV) CASE_WIDEOP_CHANGE_OPCODE_LMULS(WSUB_WV) CASE_WIDEOP_CHANGE_OPCODE_LMULS(WSUBU_WV) } - //clang-format on + // clang-format on MachineBasicBlock &MBB = *MI.getParent(); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td 
b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 71eb6f01a4f4..64cd89cda06a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -402,6 +402,21 @@ def AddiPairImmB : SDNodeXForm<imm, [{ N->getValueType(0)); }]>; +def XLenSubTrailingOnes : SDNodeXForm<imm, [{ + uint64_t XLen = Subtarget->getXLen(); + uint64_t TrailingOnes = N->getAPIntValue().countTrailingOnes(); + return CurDAG->getTargetConstant(XLen - TrailingOnes, SDLoc(N), + N->getValueType(0)); +}]>; + +// Checks if this mask is a non-empty sequence of ones starting at the +// least significant bit with the remainder zero and exceeds simm12. +def TrailingOnesMask : PatLeaf<(imm), [{ + if (!N->hasOneUse()) + return false; + return !isInt<12>(N->getSExtValue()) && isMask_64(N->getZExtValue()); +}], XLenSubTrailingOnes>; + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -1019,6 +1034,23 @@ def mul_const_oneuse : PatFrag<(ops node:$A, node:$B), return false; }]>; +def sext_oneuse : PatFrag<(ops node:$A), (sext node:$A), [{ + return N->hasOneUse(); +}]>; + +def zext_oneuse : PatFrag<(ops node:$A), (zext node:$A), [{ + return N->hasOneUse(); +}]>; + +def anyext_oneuse : PatFrag<(ops node:$A), (anyext node:$A), [{ + return N->hasOneUse(); +}]>; + +def fpext_oneuse : PatFrag<(ops node:$A), + (any_fpextend node:$A), [{ + return N->hasOneUse(); +}]>; + /// Simple arithmetic operations def : PatGprGpr<add, ADD>; @@ -1034,6 +1066,10 @@ def : PatGprUimmLog2XLen<shl, SLLI>; def : PatGprUimmLog2XLen<srl, SRLI>; def : PatGprUimmLog2XLen<sra, SRAI>; +// AND with trailing ones mask exceeding simm12. +def : Pat<(XLenVT (and GPR:$rs, TrailingOnesMask:$mask)), + (SRLI (SLLI $rs, TrailingOnesMask:$mask), TrailingOnesMask:$mask)>; + // Match both a plain shift and one where the shift amount is masked (this is // typically introduced when the legalizer promotes the shift amount and // zero-extends it). 
For RISC-V, the mask is unnecessary as shifts in the base @@ -1350,6 +1386,10 @@ def ReadFRM : ReadSysReg<SysRegFRM, [FRM]>; def WriteFRM : WriteSysReg<SysRegFRM, [FRM]>; def WriteFRMImm : WriteSysRegImm<SysRegFRM, [FRM]>; +let hasSideEffects = true in { +def ReadFFLAGS : ReadSysReg<SysRegFFLAGS, [FFLAGS]>; +def WriteFFLAGS : WriteSysReg<SysRegFFLAGS, [FFLAGS]>; +} /// Other pseudo-instructions // Pessimistically assume the stack pointer will be clobbered @@ -1476,5 +1516,6 @@ include "RISCVInstrInfoF.td" include "RISCVInstrInfoD.td" include "RISCVInstrInfoC.td" include "RISCVInstrInfoZb.td" +include "RISCVInstrInfoZk.td" include "RISCVInstrInfoV.td" include "RISCVInstrInfoZfh.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index d6c31c4804db..2837b92da81f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -30,21 +30,12 @@ def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { - -let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in -def FLD : RVInstI<0b011, OPC_LOAD_FP, (outs FPR64:$rd), - (ins GPR:$rs1, simm12:$imm12), - "fld", "$rd, ${imm12}(${rs1})">, - Sched<[WriteFLD64, ReadFMemBase]>; +def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than // reflecting the order these fields are specified in the instruction // encoding. -let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in -def FSD : RVInstS<0b011, OPC_STORE_FP, (outs), - (ins FPR64:$rs2, GPR:$rs1, simm12:$imm12), - "fsd", "$rs2, ${imm12}(${rs1})">, - Sched<[WriteFST64, ReadStoreData, ReadFMemBase]>; +def FSD : FPStore_r<0b011, "fsd", FPR64, WriteFST64>; let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in { def FMADD_D : FPFMA_rrr_frm<OPC_MADD, 0b01, "fmadd.d", FPR64>; @@ -167,6 +158,10 @@ def : InstAlias<"fge.d $rd, $rs, $rt", def PseudoFLD : PseudoFloatLoad<"fld", FPR64>; def PseudoFSD : PseudoStore<"fsd", FPR64>; +let usesCustomInserter = 1 in { +def PseudoQuietFLE_D : PseudoQuietFCMP<FPR64>; +def PseudoQuietFLT_D : PseudoQuietFCMP<FPR64>; +} } // Predicates = [HasStdExtD] //===----------------------------------------------------------------------===// @@ -231,13 +226,34 @@ def : PatFpr64Fpr64<fminnum, FMIN_D>; def : PatFpr64Fpr64<fmaxnum, FMAX_D>; /// Setcc - -def : PatFpr64Fpr64<seteq, FEQ_D>; -def : PatFpr64Fpr64<setoeq, FEQ_D>; -def : PatFpr64Fpr64<setlt, FLT_D>; -def : PatFpr64Fpr64<setolt, FLT_D>; -def : PatFpr64Fpr64<setle, FLE_D>; -def : PatFpr64Fpr64<setole, FLE_D>; +// FIXME: SETEQ/SETLT/SETLE imply nonans, can we pick better instructions for +// strict versions of those. + +// Match non-signaling FEQ_D +def : PatSetCC<FPR64, any_fsetcc, SETEQ, FEQ_D>; +def : PatSetCC<FPR64, any_fsetcc, SETOEQ, FEQ_D>; +def : PatSetCC<FPR64, strict_fsetcc, SETLT, PseudoQuietFLT_D>; +def : PatSetCC<FPR64, strict_fsetcc, SETOLT, PseudoQuietFLT_D>; +def : PatSetCC<FPR64, strict_fsetcc, SETLE, PseudoQuietFLE_D>; +def : PatSetCC<FPR64, strict_fsetcc, SETOLE, PseudoQuietFLE_D>; + +// Match signaling FEQ_D +def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETEQ), + (AND (FLE_D $rs1, $rs2), + (FLE_D $rs2, $rs1))>; +def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETOEQ), + (AND (FLE_D $rs1, $rs2), + (FLE_D $rs2, $rs1))>; +// If both operands are the same, use a single FLE. 
+def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETEQ), + (FLE_D $rs1, $rs1)>; +def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETOEQ), + (FLE_D $rs1, $rs1)>; + +def : PatSetCC<FPR64, any_fsetccs, SETLT, FLT_D>; +def : PatSetCC<FPR64, any_fsetccs, SETOLT, FLT_D>; +def : PatSetCC<FPR64, any_fsetccs, SETLE, FLE_D>; +def : PatSetCC<FPR64, any_fsetccs, SETOLE, FLE_D>; def Select_FPR64_Using_CC_GPR : SelectCC_rrirr<FPR64, GPR>; @@ -269,20 +285,22 @@ let Predicates = [HasStdExtD, IsRV32] in { /// Float constants def : Pat<(f64 (fpimm0)), (FCVT_D_W (i32 X0))>; +def : Pat<(f64 (fpimmneg0)), (FSGNJN_D (FCVT_D_W (i32 X0)), + (FCVT_D_W (i32 X0)))>; // double->[u]int. Round-to-zero must be used. def : Pat<(i32 (any_fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>; def : Pat<(i32 (any_fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>; // Saturating double->[u]int32. -def : Pat<(i32 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_W_D $rs1, 0b001)>; -def : Pat<(i32 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_WU_D $rs1, 0b001)>; +def : Pat<(i32 (riscv_fcvt_x FPR64:$rs1, timm:$frm)), (FCVT_W_D $rs1, timm:$frm)>; +def : Pat<(i32 (riscv_fcvt_xu FPR64:$rs1, timm:$frm)), (FCVT_WU_D $rs1, timm:$frm)>; // float->int32 with current rounding mode. -def : Pat<(i32 (lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>; +def : Pat<(i32 (any_lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>; // float->int32 rounded to nearest with ties rounded away from zero. -def : Pat<(i32 (lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>; +def : Pat<(i32 (any_lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>; // [u]int->double. def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>; @@ -293,6 +311,8 @@ let Predicates = [HasStdExtD, IsRV64] in { /// Float constants def : Pat<(f64 (fpimm0)), (FMV_D_X (i64 X0))>; +def : Pat<(f64 (fpimmneg0)), (FSGNJN_D (FMV_D_X (i64 X0)), + (FMV_D_X (i64 X0)))>; // Moves (no conversion) def : Pat<(bitconvert (i64 GPR:$rs1)), (FMV_D_X GPR:$rs1)>; @@ -301,28 +321,28 @@ def : Pat<(i64 (bitconvert FPR64:$rs1)), (FMV_X_D FPR64:$rs1)>; // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>; -def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rv64 FPR64:$rs1, timm:$frm), (FCVT_W_D $rs1, timm:$frm)>; +def : Pat<(riscv_any_fcvt_wu_rv64 FPR64:$rs1, timm:$frm), (FCVT_WU_D $rs1, timm:$frm)>; // [u]int32->fp def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>; def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>; // Saturating double->[u]int64. -def : Pat<(i64 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_L_D $rs1, 0b001)>; -def : Pat<(i64 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_LU_D $rs1, 0b001)>; +def : Pat<(i64 (riscv_fcvt_x FPR64:$rs1, timm:$frm)), (FCVT_L_D $rs1, timm:$frm)>; +def : Pat<(i64 (riscv_fcvt_xu FPR64:$rs1, timm:$frm)), (FCVT_LU_D $rs1, timm:$frm)>; // double->[u]int64. Round-to-zero must be used. def : Pat<(i64 (any_fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>; def : Pat<(i64 (any_fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>; // double->int64 with current rounding mode. 
-def : Pat<(i64 (lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>; -def : Pat<(i64 (llrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>; +def : Pat<(i64 (any_lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>; +def : Pat<(i64 (any_llrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>; // double->int64 rounded to nearest with ties rounded away from zero. -def : Pat<(i64 (lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; -def : Pat<(i64 (llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; +def : Pat<(i64 (any_lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; +def : Pat<(i64 (any_llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; // [u]int64->fp. Match GCC and default to using dynamic rounding mode. def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index bb45ed859442..a8ac06ba8da3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -20,36 +20,38 @@ def SDT_RISCVFMV_W_X_RV64 def SDT_RISCVFMV_X_ANYEXTW_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; def SDT_RISCVFCVT_W_RV64 - : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisFP<1>]>; + : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisFP<1>, + SDTCisVT<2, i64>]>; def SDT_RISCVFCVT_X - : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>; + : SDTypeProfile<1, 2, [SDTCisVT<0, XLenVT>, SDTCisFP<1>, + SDTCisVT<2, XLenVT>]>; def riscv_fmv_w_x_rv64 : SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>; def riscv_fmv_x_anyextw_rv64 : SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>; -def riscv_fcvt_w_rtz_rv64 - : SDNode<"RISCVISD::FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64>; -def riscv_fcvt_wu_rtz_rv64 - : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64>; -def riscv_fcvt_x_rtz - : SDNode<"RISCVISD::FCVT_X_RTZ", SDT_RISCVFCVT_X>; -def riscv_fcvt_xu_rtz - : SDNode<"RISCVISD::FCVT_XU_RTZ", SDT_RISCVFCVT_X>; - -def riscv_strict_fcvt_w_rtz_rv64 - : SDNode<"RISCVISD::STRICT_FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64, +def riscv_fcvt_w_rv64 + : SDNode<"RISCVISD::FCVT_W_RV64", SDT_RISCVFCVT_W_RV64>; +def riscv_fcvt_wu_rv64 + : SDNode<"RISCVISD::FCVT_WU_RV64", SDT_RISCVFCVT_W_RV64>; +def riscv_fcvt_x + : SDNode<"RISCVISD::FCVT_X", SDT_RISCVFCVT_X>; +def riscv_fcvt_xu + : SDNode<"RISCVISD::FCVT_XU", SDT_RISCVFCVT_X>; + +def riscv_strict_fcvt_w_rv64 + : SDNode<"RISCVISD::STRICT_FCVT_W_RV64", SDT_RISCVFCVT_W_RV64, [SDNPHasChain]>; -def riscv_strict_fcvt_wu_rtz_rv64 - : SDNode<"RISCVISD::STRICT_FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64, +def riscv_strict_fcvt_wu_rv64 + : SDNode<"RISCVISD::STRICT_FCVT_WU_RV64", SDT_RISCVFCVT_W_RV64, [SDNPHasChain]>; -def riscv_any_fcvt_w_rtz_rv64 : PatFrags<(ops node:$src), - [(riscv_strict_fcvt_w_rtz_rv64 node:$src), - (riscv_fcvt_w_rtz_rv64 node:$src)]>; -def riscv_any_fcvt_wu_rtz_rv64 : PatFrags<(ops node:$src), - [(riscv_strict_fcvt_wu_rtz_rv64 node:$src), - (riscv_fcvt_wu_rtz_rv64 node:$src)]>; +def riscv_any_fcvt_w_rv64 : PatFrags<(ops node:$src, node:$frm), + [(riscv_strict_fcvt_w_rv64 node:$src, node:$frm), + (riscv_fcvt_w_rv64 node:$src, node:$frm)]>; +def riscv_any_fcvt_wu_rv64 : PatFrags<(ops node:$src, node:$frm), + [(riscv_strict_fcvt_wu_rv64 node:$src, node:$frm), + (riscv_fcvt_wu_rv64 node:$src, node:$frm)]>; //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. 
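The FPR64 setcc rework a few hunks back, and the matching FPR32 patterns that follow, split quiet compares from signaling ones: feq is a quiet compare, so a signaling equality (strict_fsetccs SETEQ) is assembled from two fle instructions, which raise the Invalid flag for any NaN operand, and it collapses to a single fle when both operands are the same register. C++'s built-in relational operators happen to have the same quiet/signaling split, so a scalar model is short (function names are invented):

// feq semantics: '==' is a quiet compare; it raises Invalid only for
// signaling NaNs, never for quiet ones.
bool quietFEQ(double A, double B) { return A == B; }

// Signaling equality as in the strict_fsetccs SETEQ pattern: '<=' raises
// Invalid for any NaN operand, and both directions together test equality.
bool signalingFEQ(double A, double B) { return (A <= B) && (B <= A); }

// Single-compare special case when both operands are the same value.
bool signalingFEQSame(double A) { return A <= A; }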
@@ -73,6 +75,22 @@ def frmarg : Operand<XLenVT> { // Instruction class templates //===----------------------------------------------------------------------===// +let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in +class FPLoad_r<bits<3> funct3, string opcodestr, RegisterClass rty, + SchedWrite sw> + : RVInstI<funct3, OPC_LOAD_FP, (outs rty:$rd), + (ins GPR:$rs1, simm12:$imm12), + opcodestr, "$rd, ${imm12}(${rs1})">, + Sched<[sw, ReadFMemBase]>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in +class FPStore_r<bits<3> funct3, string opcodestr, RegisterClass rty, + SchedWrite sw> + : RVInstS<funct3, OPC_STORE_FP, (outs), + (ins rty:$rs2, GPR:$rs1, simm12:$imm12), + opcodestr, "$rs2, ${imm12}(${rs1})">, + Sched<[sw, ReadStoreData, ReadFMemBase]>; + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in class FPFMA_rrr_frm<RISCVOpcode opcode, bits<2> funct2, string opcodestr, @@ -138,20 +156,12 @@ class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr, //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { -let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in -def FLW : RVInstI<0b010, OPC_LOAD_FP, (outs FPR32:$rd), - (ins GPR:$rs1, simm12:$imm12), - "flw", "$rd, ${imm12}(${rs1})">, - Sched<[WriteFLD32, ReadFMemBase]>; +def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than // reflecting the order these fields are specified in the instruction // encoding. -let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in -def FSW : RVInstS<0b010, OPC_STORE_FP, (outs), - (ins FPR32:$rs2, GPR:$rs1, simm12:$imm12), - "fsw", "$rs2, ${imm12}(${rs1})">, - Sched<[WriteFST32, ReadStoreData, ReadFMemBase]>; +def FSW : FPStore_r<0b010, "fsw", FPR32, WriteFST32>; let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { def FMADD_S : FPFMA_rrr_frm<OPC_MADD, 0b00, "fmadd.s", FPR32>; @@ -299,6 +309,10 @@ def : MnemonicAlias<"fmv.x.s", "fmv.x.w">; def PseudoFLW : PseudoFloatLoad<"flw", FPR32>; def PseudoFSW : PseudoStore<"fsw", FPR32>; +let usesCustomInserter = 1 in { +def PseudoQuietFLE_S : PseudoQuietFCMP<FPR32>; +def PseudoQuietFLT_S : PseudoQuietFCMP<FPR32>; +} } // Predicates = [HasStdExtF] //===----------------------------------------------------------------------===// @@ -306,9 +320,13 @@ def PseudoFSW : PseudoStore<"fsw", FPR32>; //===----------------------------------------------------------------------===// /// Floating point constants -def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>; +def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>; +def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>; /// Generic pattern classes +class PatSetCC<RegisterClass Ty, SDPatternOperator OpNode, CondCode Cond, RVInst Inst> + : Pat<(OpNode Ty:$rs1, Ty:$rs2, Cond), (Inst $rs1, $rs2)>; + class PatFpr32Fpr32<SDPatternOperator OpNode, RVInstR Inst> : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>; @@ -319,6 +337,7 @@ let Predicates = [HasStdExtF] in { /// Float constants def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>; +def : Pat<(f32 (fpimmneg0)), (FSGNJN_S (FMV_W_X X0), (FMV_W_X X0))>; /// Float conversion operations @@ -363,13 +382,34 @@ def : PatFpr32Fpr32<fminnum, FMIN_S>; def : PatFpr32Fpr32<fmaxnum, FMAX_S>; /// Setcc - -def : PatFpr32Fpr32<seteq, FEQ_S>; -def : PatFpr32Fpr32<setoeq, FEQ_S>; -def : PatFpr32Fpr32<setlt, FLT_S>; -def : 
PatFpr32Fpr32<setolt, FLT_S>; -def : PatFpr32Fpr32<setle, FLE_S>; -def : PatFpr32Fpr32<setole, FLE_S>; +// FIXME: SETEQ/SETLT/SETLE imply nonans, can we pick better instructions for +// strict versions of those. + +// Match non-signaling FEQ_S +def : PatSetCC<FPR32, any_fsetcc, SETEQ, FEQ_S>; +def : PatSetCC<FPR32, any_fsetcc, SETOEQ, FEQ_S>; +def : PatSetCC<FPR32, strict_fsetcc, SETLT, PseudoQuietFLT_S>; +def : PatSetCC<FPR32, strict_fsetcc, SETOLT, PseudoQuietFLT_S>; +def : PatSetCC<FPR32, strict_fsetcc, SETLE, PseudoQuietFLE_S>; +def : PatSetCC<FPR32, strict_fsetcc, SETOLE, PseudoQuietFLE_S>; + +// Match signaling FEQ_S +def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETEQ), + (AND (FLE_S $rs1, $rs2), + (FLE_S $rs2, $rs1))>; +def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETOEQ), + (AND (FLE_S $rs1, $rs2), + (FLE_S $rs2, $rs1))>; +// If both operands are the same, use a single FLE. +def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETEQ), + (FLE_S $rs1, $rs1)>; +def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETOEQ), + (FLE_S $rs1, $rs1)>; + +def : PatSetCC<FPR32, any_fsetccs, SETLT, FLT_S>; +def : PatSetCC<FPR32, any_fsetccs, SETOLT, FLT_S>; +def : PatSetCC<FPR32, any_fsetccs, SETLE, FLE_S>; +def : PatSetCC<FPR32, any_fsetccs, SETOLE, FLE_S>; def Select_FPR32_Using_CC_GPR : SelectCC_rrirr<FPR32, GPR>; @@ -393,14 +433,14 @@ def : Pat<(i32 (any_fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; def : Pat<(i32 (any_fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>; // Saturating float->[u]int32. -def : Pat<(i32 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; -def : Pat<(i32 (riscv_fcvt_xu_rtz FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>; +def : Pat<(i32 (riscv_fcvt_x FPR32:$rs1, timm:$frm)), (FCVT_W_S $rs1, timm:$frm)>; +def : Pat<(i32 (riscv_fcvt_xu FPR32:$rs1, timm:$frm)), (FCVT_WU_S $rs1, timm:$frm)>; // float->int32 with current rounding mode. -def : Pat<(i32 (lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>; +def : Pat<(i32 (any_lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>; // float->int32 rounded to nearest with ties rounded away from zero. -def : Pat<(i32 (lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>; +def : Pat<(i32 (any_lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>; // [u]int->float. Match GCC and default to using dynamic rounding mode. def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>; @@ -417,24 +457,24 @@ def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32), // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>; -def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rv64 FPR32:$rs1, timm:$frm), (FCVT_W_S $rs1, timm:$frm)>; +def : Pat<(riscv_any_fcvt_wu_rv64 FPR32:$rs1, timm:$frm), (FCVT_WU_S $rs1, timm:$frm)>; // float->[u]int64. Round-to-zero must be used. def : Pat<(i64 (any_fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; def : Pat<(i64 (any_fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>; // Saturating float->[u]int64. 
-def : Pat<(i64 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>;
-def : Pat<(i64 (riscv_fcvt_xu_rtz FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_x FPR32:$rs1, timm:$frm)), (FCVT_L_S $rs1, timm:$frm)>;
+def : Pat<(i64 (riscv_fcvt_xu FPR32:$rs1, timm:$frm)), (FCVT_LU_S $rs1, timm:$frm)>;

 // float->int64 with current rounding mode.
-def : Pat<(i64 (lrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
-def : Pat<(i64 (llrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
+def : Pat<(i64 (any_lrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
+def : Pat<(i64 (any_llrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;

 // float->int64 rounded to nearest with ties rounded away from zero.
-def : Pat<(i64 (lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
-def : Pat<(i64 (llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
+def : Pat<(i64 (any_lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
+def : Pat<(i64 (any_llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;

 // [u]int->fp. Match GCC and default to using dynamic rounding mode.
 def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 173ae43a08d6..306024a3e4fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -19,18 +19,22 @@ include "RISCVInstrFormatsV.td"
 // Operand and SDNode transformation definitions.
 //===----------------------------------------------------------------------===//

-def VTypeIAsmOperand : AsmOperandClass {
-  let Name = "VTypeI";
+class VTypeIAsmOperand<int VTypeINum> : AsmOperandClass {
+  let Name = "VTypeI" # VTypeINum;
   let ParserMethod = "parseVTypeI";
   let DiagnosticType = "InvalidVTypeI";
+  let RenderMethod = "addVTypeIOperands";
 }

-def VTypeIOp : Operand<XLenVT> {
-  let ParserMatchClass = VTypeIAsmOperand;
+class VTypeIOp<int VTypeINum> : Operand<XLenVT> {
+  let ParserMatchClass = VTypeIAsmOperand<VTypeINum>;
   let PrintMethod = "printVTypeI";
-  let DecoderMethod = "decodeUImmOperand<11>";
+  let DecoderMethod = "decodeUImmOperand<"#VTypeINum#">";
 }

+def VTypeIOp10 : VTypeIOp<10>;
+def VTypeIOp11 : VTypeIOp<11>;
+
 def VMaskAsmOperand : AsmOperandClass {
   let Name = "RVVMaskRegOpOperand";
   let RenderMethod = "addRegOperands";
@@ -77,6 +81,9 @@ def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
   }];
 }

+def simm5_plus1_nonzero : ImmLeaf<XLenVT,
+  [{return Imm != 0 && ((isInt<5>(Imm) && Imm != -16) || Imm == 16);}]>;
+
 //===----------------------------------------------------------------------===//
 // Scheduling definitions.
 //===----------------------------------------------------------------------===//
@@ -342,6 +349,27 @@ class VALUVs2<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr>
 // Combination of instruction classes.
 // Use these multiclasses to define instructions more easily.
//===----------------------------------------------------------------------===// + +multiclass VIndexLoadStore<list<int> EEWList> { + foreach n = EEWList in { + defvar w = !cast<RISCVWidth>("LSWidth" # n); + + def VLUXEI # n # _V : + VIndexedLoad<MOPLDIndexedUnord, w, "vluxei" # n # ".v">, + VLXSched<n, "U">; + def VLOXEI # n # _V : + VIndexedLoad<MOPLDIndexedOrder, w, "vloxei" # n # ".v">, + VLXSched<n, "O">; + + def VSUXEI # n # _V : + VIndexedStore<MOPSTIndexedUnord, w, "vsuxei" # n # ".v">, + VSXSched<n, "U">; + def VSOXEI # n # _V : + VIndexedStore<MOPSTIndexedOrder, w, "vsoxei" # n # ".v">, + VSXSched<n, "O">; + } +} + multiclass VALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> { def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">, Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; @@ -757,7 +785,7 @@ multiclass VCPR_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> { } multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> { - foreach l = [8, 16, 32, 64] in { + foreach l = [8, 16, 32] in { defvar w = !cast<RISCVWidth>("LSWidth" # l); defvar s = !cast<SchedWrite>("WriteVLD" # !add(nf, 1) # "R" # l); @@ -765,23 +793,27 @@ multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> { Sched<[s, ReadVLDX]>; } } +multiclass VWholeLoadEEW64<bits<3> nf, string opcodestr, RegisterClass VRC, SchedReadWrite schedrw> { + def E64_V : VWholeLoad<nf, LSWidth64, opcodestr # "e64.v", VRC>, + Sched<[schedrw, ReadVLDX]>; +} //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtV] in { +let Predicates = [HasVInstructions] in { let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in { -def VSETVLI : RVInstSetVLi<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei), +def VSETVLI : RVInstSetVLi<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp11:$vtypei), "vsetvli", "$rd, $rs1, $vtypei">; -def VSETIVLI : RVInstSetiVLi<(outs GPR:$rd), (ins uimm5:$uimm, VTypeIOp:$vtypei), +def VSETIVLI : RVInstSetiVLi<(outs GPR:$rd), (ins uimm5:$uimm, VTypeIOp10:$vtypei), "vsetivli", "$rd, $uimm, $vtypei">; def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), "vsetvl", "$rd, $rs1, $rs2">; } // hasSideEffects = 1, mayLoad = 0, mayStore = 0 -foreach eew = [8, 16, 32, 64] in { +foreach eew = [8, 16, 32] in { defvar w = !cast<RISCVWidth>("LSWidth" # eew); // Vector Unit-Stride Instructions @@ -794,18 +826,12 @@ foreach eew = [8, 16, 32, 64] in { // Vector Strided Instructions def VLSE#eew#_V : VStridedLoad<w, "vlse"#eew#".v">, VLSSched<eew>; def VSSE#eew#_V : VStridedStore<w, "vsse"#eew#".v">, VSSSched<eew>; - - // Vector Indexed Instructions - def VLUXEI#eew#_V : - VIndexedLoad<MOPLDIndexedUnord, w, "vluxei"#eew#".v">, VLXSched<eew, "U">; - def VLOXEI#eew#_V : - VIndexedLoad<MOPLDIndexedOrder, w, "vloxei"#eew#".v">, VLXSched<eew, "O">; - def VSUXEI#eew#_V : - VIndexedStore<MOPSTIndexedUnord, w, "vsuxei"#eew#".v">, VSXSched<eew, "U">; - def VSOXEI#eew#_V : - VIndexedStore<MOPSTIndexedOrder, w, "vsoxei"#eew#".v">, VSXSched<eew, "O">; } +defm "" : VIndexLoadStore<[8, 16, 32]>; +} // Predicates = [HasVInstructions] + +let Predicates = [HasVInstructions] in { def VLM_V : VUnitStrideLoadMask<"vlm.v">, Sched<[WriteVLDM, ReadVLDX]>; def VSM_V : VUnitStrideStoreMask<"vsm.v">, @@ -820,11 +846,6 @@ defm VL2R : VWholeLoadN<1, "vl2r", VRM2>; defm VL4R : VWholeLoadN<3, "vl4r", VRM4>; defm VL8R : 
VWholeLoadN<7, "vl8r", VRM8>; -def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>; -def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>; -def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>; -def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>; - def VS1R_V : VWholeStore<0, "vs1r.v", VR>, Sched<[WriteVST1R, ReadVST1R, ReadVSTX]>; def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>, @@ -834,6 +855,40 @@ def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>, def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>, Sched<[WriteVST8R, ReadVST8R, ReadVSTX]>; +def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>; +def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>; +def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>; +def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>; +} // Predicates = [HasVInstructions] + +let Predicates = [HasVInstructionsI64] in { +// Vector Unit-Stride Instructions +def VLE64_V : VUnitStrideLoad<LSWidth64, "vle64.v">, + VLESched<64>; + +def VLE64FF_V : VUnitStrideLoadFF<LSWidth64, "vle64ff.v">, + VLFSched<64>; + +def VSE64_V : VUnitStrideStore<LSWidth64, "vse64.v">, + VSESched<64>; +// Vector Strided Instructions +def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">, + VLSSched<32>; + +def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">, + VSSSched<64>; + +defm VL1R: VWholeLoadEEW64<0, "vl1r", VR, WriteVLD1R64>; +defm VL2R: VWholeLoadEEW64<1, "vl2r", VRM2, WriteVLD2R64>; +defm VL4R: VWholeLoadEEW64<3, "vl4r", VRM4, WriteVLD4R64>; +defm VL8R: VWholeLoadEEW64<7, "vl8r", VRM8, WriteVLD8R64>; +} // Predicates = [HasVInstructionsI64] +let Predicates = [IsRV64, HasVInstructionsI64] in { + // Vector Indexed Instructions + defm "" : VIndexLoadStore<[64]>; +} // [IsRV64, HasVInstructionsI64] + +let Predicates = [HasVInstructions] in { // Vector Single-Width Integer Add and Subtract defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>; defm VSUB_V : VALU_IV_V_X<"vsub", 0b000010>; @@ -1065,9 +1120,9 @@ let Constraints = "@earlyclobber $vd" in { defm VNCLIPU_W : VNCLP_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">; defm VNCLIP_W : VNCLP_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">; } // Constraints = "@earlyclobber $vd" -} // Predicates = [HasStdExtV] +} // Predicates = [HasVInstructions] -let Predicates = [HasStdExtV, HasStdExtF] in { +let Predicates = [HasVInstructionsAnyF] in { // Vector Single-Width Floating-Point Add/Subtract Instructions defm VFADD_V : VALU_FV_V_F<"vfadd", 0b000000>; defm VFSUB_V : VALU_FV_V_F<"vfsub", 0b000010>; @@ -1202,9 +1257,9 @@ defm VFNCVT_F_X_W : VNCVTF_IV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>; defm VFNCVT_F_F_W : VNCVTF_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>; defm VFNCVT_ROD_F_F_W : VNCVTF_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>; } // Constraints = "@earlyclobber $vd" -} // Predicates = [HasStdExtV, HasStdExtF] +} // Predicates = HasVInstructionsAnyF] -let Predicates = [HasStdExtV] in { +let Predicates = [HasVInstructions] in { // Vector Single-Width Integer Reduction Instructions let RVVConstraint = NoConstraint in { @@ -1228,9 +1283,9 @@ defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>; defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>; } // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint -} // Predicates = [HasStdExtV] +} // Predicates = [HasVInstructions] -let Predicates = [HasStdExtV, HasStdExtF] in { +let Predicates = [HasVInstructionsAnyF] in { // Vector Single-Width Floating-Point Reduction Instructions let 
RVVConstraint = NoConstraint in { defm VFREDOSUM : VREDO_FV_V<"vfredosum", 0b000011>; @@ -1254,9 +1309,9 @@ defm VFWREDUSUM : VWRED_FV_V<"vfwredusum", 0b110001>; def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm", (VFWREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>; -} // Predicates = [HasStdExtV, HasStdExtF] +} // Predicates = [HasVInstructionsAnyF] -let Predicates = [HasStdExtV] in { +let Predicates = [HasVInstructions] in { // Vector Mask-Register Logical Instructions let RVVConstraint = NoConstraint in { defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">; @@ -1337,9 +1392,9 @@ def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb), } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 -} // Predicates = [HasStdExtV] +} // Predicates = [HasVInstructions] -let Predicates = [HasStdExtV, HasStdExtF] in { +let Predicates = [HasVInstructionsAnyF] in { let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1, RVVConstraint = NoConstraint in { @@ -1354,9 +1409,9 @@ def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb), } // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1 -} // Predicates = [HasStdExtV, HasStdExtF] +} // Predicates = [HasVInstructionsAnyF] -let Predicates = [HasStdExtV] in { +let Predicates = [HasVInstructions] in { // Vector Slide Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, uimm5>; @@ -1364,16 +1419,16 @@ defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, uimm5>; defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>; -} // Predicates = [HasStdExtV] +} // Predicates = [HasVInstructions] -let Predicates = [HasStdExtV, HasStdExtF] in { +let Predicates = [HasVInstructionsAnyF] in { let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>; -} // Predicates = [HasStdExtV, HasStdExtF] +} // Predicates = [HasVInstructionsAnyF] -let Predicates = [HasStdExtV] in { +let Predicates = [HasVInstructions] in { // Vector Register Gather Instruction let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in { defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100, uimm5>; @@ -1404,11 +1459,11 @@ foreach n = [2, 4, 8] in { } } } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 -} // Predicates = [HasStdExtV] +} // Predicates = [HasVInstructions] -let Predicates = [HasStdExtZvlsseg] in { +let Predicates = [HasVInstructions] in { foreach nf=2-8 in { - foreach eew = [8, 16, 32, 64] in { + foreach eew = [8, 16, 32] in { defvar w = !cast<RISCVWidth>("LSWidth"#eew); def VLSEG#nf#E#eew#_V : @@ -1439,6 +1494,41 @@ let Predicates = [HasStdExtZvlsseg] in { "vsoxseg"#nf#"ei"#eew#".v">; } } -} // Predicates = [HasStdExtZvlsseg] +} // Predicates = [HasVInstructions] + +let Predicates = [HasVInstructionsI64] in { + foreach nf=2-8 in { + // Vector Unit-strided Segment Instructions + def VLSEG#nf#E64_V : + VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">; + def VLSEG#nf#E64FF_V : + VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">; + def VSSEG#nf#E64_V : + VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">; + + // Vector Strided Segment Instructions + def VLSSEG#nf#E64_V : + VStridedSegmentLoad<!add(nf, -1), 
LSWidth64, "vlsseg"#nf#"e64.v">; + def VSSSEG#nf#E64_V : + VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">; + } +} // Predicates = [HasVInstructionsI64] +let Predicates = [HasVInstructionsI64, IsRV64] in { + foreach nf=2-8 in { + // Vector Indexed Segment Instructions + def VLUXSEG#nf#EI64_V : + VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, LSWidth64, + "vluxseg"#nf#"ei64.v">; + def VLOXSEG#nf#EI64_V : + VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, LSWidth64, + "vloxseg"#nf#"ei64.v">; + def VSUXSEG#nf#EI64_V : + VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, LSWidth64, + "vsuxseg"#nf#"ei64.v">; + def VSOXSEG#nf#EI64_V : + VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, LSWidth64, + "vsoxseg"#nf#"ei64.v">; + } +} // Predicates = [HasVInstructionsI64, IsRV64] include "RISCVInstrInfoVPseudos.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 073fa605e0fb..4e7e251bc412 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -71,49 +71,45 @@ def V_MF4 : LMULInfo<0b110, 2, VR, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR, "M def V_MF2 : LMULInfo<0b111, 4, VR, VR, VR, VR,/*NoVReg*/VR, "MF2">; // Used to iterate over all possible LMULs. -def MxList { - list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; -} +defvar MxList = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; +// For floating point which don't need MF8. +defvar MxListF = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; + // Used for widening and narrowing instructions as it doesn't contain M8. -def MxListW { - list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4]; -} +defvar MxListW = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4]; +// For floating point which don't need MF8. +defvar MxListFW = [V_MF4, V_MF2, V_M1, V_M2, V_M4]; + // Use for zext/sext.vf2 -def MxListVF2 { - list<LMULInfo> m = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; -} +defvar MxListVF2 = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; + // Use for zext/sext.vf4 -def MxListVF4 { - list<LMULInfo> m = [V_MF2, V_M1, V_M2, V_M4, V_M8]; -} +defvar MxListVF4 = [V_MF2, V_M1, V_M2, V_M4, V_M8]; + // Use for zext/sext.vf8 -def MxListVF8 { - list<LMULInfo> m = [V_M1, V_M2, V_M4, V_M8]; +defvar MxListVF8 = [V_M1, V_M2, V_M4, V_M8]; + +class MxSet<int eew> { + list<LMULInfo> m = !cond(!eq(eew, 8) : [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8], + !eq(eew, 16) : [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8], + !eq(eew, 32) : [V_MF2, V_M1, V_M2, V_M4, V_M8], + !eq(eew, 64) : [V_M1, V_M2, V_M4, V_M8]); } -class FPR_Info<RegisterClass regclass, string fx> { +class FPR_Info<RegisterClass regclass, string fx, list<LMULInfo> mxlist> { RegisterClass fprclass = regclass; string FX = fx; + list<LMULInfo> MxList = mxlist; } -def SCALAR_F16 : FPR_Info<FPR16, "F16">; -def SCALAR_F32 : FPR_Info<FPR32, "F32">; -def SCALAR_F64 : FPR_Info<FPR64, "F64">; +def SCALAR_F16 : FPR_Info<FPR16, "F16", MxSet<16>.m>; +def SCALAR_F32 : FPR_Info<FPR32, "F32", MxSet<32>.m>; +def SCALAR_F64 : FPR_Info<FPR64, "F64", MxSet<64>.m>; -def FPList { - list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32, SCALAR_F64]; -} -// Used for widening instructions. It excludes F64. 
-def FPListW { - list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32]; -} +defvar FPList = [SCALAR_F16, SCALAR_F32, SCALAR_F64]; -class MxSet<int eew> { - list<LMULInfo> m = !cond(!eq(eew, 8) : [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8], - !eq(eew, 16) : [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8], - !eq(eew, 32) : [V_MF2, V_M1, V_M2, V_M4, V_M8], - !eq(eew, 64) : [V_M1, V_M2, V_M4, V_M8]); -} +// Used for widening instructions. It excludes F64. +defvar FPListW = [SCALAR_F16, SCALAR_F32]; class NFSet<LMULInfo m> { list<int> L = !cond(!eq(m.value, V_M8.value): [], @@ -236,25 +232,25 @@ defset list<VTypeInfo> AllVectors = { defset list<GroupVTypeInfo> GroupFloatVectors = { def VF16M2: GroupVTypeInfo<vfloat16m2_t, vfloat16m1_t, vbool8_t, 16, - VRM2, V_M2, f16, FPR16>; + VRM2, V_M2, f16, FPR16>; def VF16M4: GroupVTypeInfo<vfloat16m4_t, vfloat16m1_t, vbool4_t, 16, - VRM4, V_M4, f16, FPR16>; + VRM4, V_M4, f16, FPR16>; def VF16M8: GroupVTypeInfo<vfloat16m8_t, vfloat16m1_t, vbool2_t, 16, - VRM8, V_M8, f16, FPR16>; + VRM8, V_M8, f16, FPR16>; def VF32M2: GroupVTypeInfo<vfloat32m2_t, vfloat32m1_t, vbool16_t, 32, - VRM2, V_M2, f32, FPR32>; + VRM2, V_M2, f32, FPR32>; def VF32M4: GroupVTypeInfo<vfloat32m4_t, vfloat32m1_t, vbool8_t, 32, - VRM4, V_M4, f32, FPR32>; + VRM4, V_M4, f32, FPR32>; def VF32M8: GroupVTypeInfo<vfloat32m8_t, vfloat32m1_t, vbool4_t, 32, - VRM8, V_M8, f32, FPR32>; + VRM8, V_M8, f32, FPR32>; def VF64M2: GroupVTypeInfo<vfloat64m2_t, vfloat64m1_t, vbool32_t, 64, - VRM2, V_M2, f64, FPR64>; + VRM2, V_M2, f64, FPR64>; def VF64M4: GroupVTypeInfo<vfloat64m4_t, vfloat64m1_t, vbool16_t, 64, - VRM4, V_M4, f64, FPR64>; + VRM4, V_M4, f64, FPR64>; def VF64M8: GroupVTypeInfo<vfloat64m8_t, vfloat64m1_t, vbool8_t, 64, - VRM8, V_M8, f64, FPR64>; + VRM8, V_M8, f64, FPR64>; } } } @@ -423,13 +419,14 @@ def RISCVVPseudosTable : GenericTable { def RISCVVIntrinsicsTable : GenericTable { let FilterClass = "RISCVVIntrinsic"; let CppTypeName = "RISCVVIntrinsicInfo"; - let Fields = ["IntrinsicID", "SplatOperand"]; + let Fields = ["IntrinsicID", "SplatOperand", "VLOperand"]; let PrimaryKey = ["IntrinsicID"]; let PrimaryKeyName = "getRISCVVIntrinsicInfo"; } -class RISCVVLE<bit M, bit Str, bit F, bits<3> S, bits<3> L> { +class RISCVVLE<bit M, bit TU, bit Str, bit F, bits<3> S, bits<3> L> { bits<1> Masked = M; + bits<1> IsTU = TU; bits<1> Strided = Str; bits<1> FF = F; bits<3> Log2SEW = S; @@ -440,8 +437,8 @@ class RISCVVLE<bit M, bit Str, bit F, bits<3> S, bits<3> L> { def RISCVVLETable : GenericTable { let FilterClass = "RISCVVLE"; let CppTypeName = "VLEPseudo"; - let Fields = ["Masked", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"]; - let PrimaryKey = ["Masked", "Strided", "FF", "Log2SEW", "LMUL"]; + let Fields = ["Masked", "IsTU", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"]; + let PrimaryKey = ["Masked", "IsTU", "Strided", "FF", "Log2SEW", "LMUL"]; let PrimaryKeyName = "getVLEPseudo"; } @@ -461,8 +458,9 @@ def RISCVVSETable : GenericTable { let PrimaryKeyName = "getVSEPseudo"; } -class RISCVVLX_VSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> { +class RISCVVLX_VSX<bit M, bit TU, bit O, bits<3> S, bits<3> L, bits<3> IL> { bits<1> Masked = M; + bits<1> IsTU = TU; bits<1> Ordered = O; bits<3> Log2SEW = S; bits<3> LMUL = L; @@ -470,15 +468,15 @@ class RISCVVLX_VSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> { Pseudo Pseudo = !cast<Pseudo>(NAME); } -class RISCVVLX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> : - RISCVVLX_VSX<M, O, S, L, IL>; +class RISCVVLX<bit M, bit TU, bit O, bits<3> S, bits<3> L, bits<3> 
IL> : + RISCVVLX_VSX<M, TU, O, S, L, IL>; class RISCVVSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> : - RISCVVLX_VSX<M, O, S, L, IL>; + RISCVVLX_VSX<M, /*TU*/0, O, S, L, IL>; class RISCVVLX_VSXTable : GenericTable { let CppTypeName = "VLX_VSXPseudo"; - let Fields = ["Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"]; - let PrimaryKey = ["Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"]; + let Fields = ["Masked", "IsTU", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"]; + let PrimaryKey = ["Masked", "IsTU", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"]; } def RISCVVLXTable : RISCVVLX_VSXTable { @@ -583,10 +581,11 @@ class PseudoToVInst<string PseudoInst> { !subst("_B64", "", !subst("_MASK", "", !subst("_TIED", "", + !subst("_TU", "", !subst("F16", "F", !subst("F32", "F", !subst("F64", "F", - !subst("Pseudo", "", PseudoInst)))))))))))))))))))); + !subst("Pseudo", "", PseudoInst))))))))))))))))))))); } // The destination vector register group for a masked vector instruction cannot @@ -632,7 +631,7 @@ class VPseudoUSLoadNoMask<VReg RetClass, int EEW, bit isFF> : Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> { + RISCVVLE</*Masked*/0, /*TU*/0, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -642,13 +641,29 @@ class VPseudoUSLoadNoMask<VReg RetClass, int EEW, bit isFF> : let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } +class VPseudoUSLoadNoMaskTU<VReg RetClass, int EEW, bit isFF> : + Pseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLE</*Masked*/0, /*TU*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; + let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); +} + class VPseudoUSLoadMask<VReg RetClass, int EEW, bit isFF> : Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> { + RISCVVLE</*Masked*/1, /*TU*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -664,7 +679,7 @@ class VPseudoSLoadNoMask<VReg RetClass, int EEW>: Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> { + RISCVVLE</*Masked*/0, /*TU*/0, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -674,13 +689,29 @@ class VPseudoSLoadNoMask<VReg RetClass, int EEW>: let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } +class VPseudoSLoadNoMaskTU<VReg RetClass, int EEW>: + Pseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLE</*Masked*/0, /*TU*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; + let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); +} + class VPseudoSLoadMask<VReg RetClass, 
int EEW>: Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> { + RISCVVLE</*Masked*/1, /*TU*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -695,9 +726,10 @@ class VPseudoSLoadMask<VReg RetClass, int EEW>: class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL, bit Ordered, bit EarlyClobber>: Pseudo<(outs RetClass:$rd), - (ins GPR:$rs1, IdxClass:$rs2, AVL:$vl, ixlenimm:$sew),[]>, + (ins GPR:$rs1, IdxClass:$rs2, AVL:$vl, + ixlenimm:$sew),[]>, RISCVVPseudo, - RISCVVLX</*Masked*/0, Ordered, log2<EEW>.val, VLMul, LMUL> { + RISCVVLX</*Masked*/0, /*TU*/0, Ordered, log2<EEW>.val, VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -708,6 +740,24 @@ class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL, let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } +class VPseudoILoadNoMaskTU<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL, + bit Ordered, bit EarlyClobber>: + Pseudo<(outs RetClass:$rd), + (ins RetClass:$dest, GPR:$rs1, IdxClass:$rs2, AVL:$vl, + ixlenimm:$sew),[]>, + RISCVVPseudo, + RISCVVLX</*Masked*/0, /*TU*/1, Ordered, log2<EEW>.val, VLMul, LMUL> { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasDummyMask = 1; + let HasMergeOp = 1; + let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $dest", "$rd = $dest"); + let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); +} + class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL, bit Ordered, bit EarlyClobber>: Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd), @@ -715,7 +765,7 @@ class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL, GPR:$rs1, IdxClass:$rs2, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>, RISCVVPseudo, - RISCVVLX</*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> { + RISCVVLX</*Masked*/1, /*TU*/1, Ordered, log2<EEW>.val, VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; let hasSideEffects = 0; @@ -932,6 +982,9 @@ class VPseudoBinaryNoMask<VReg RetClass, let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } +// Special version of VPseudoBinaryNoMask where we pretend the first source is +// tied to the destination. +// This allows maskedoff and rs2 to be the same register. 
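// [Illustrative sketch, not part of this diff] The "tie" described above is
// expressed through the machine-operand Constraints string rather than
// through the ins list; a hypothetical standalone pseudo showing only that
// mechanism (the name and operand list here are invented for illustration):
let Constraints = "$rd = $rs2" in
def PseudoTiedBinarySketch : Pseudo<(outs VR:$rd),
                                    (ins VR:$rs2, VR:$rs1, AVL:$vl,
                                         ixlenimm:$sew), []>;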
class VPseudoTiedBinaryNoMask<VReg RetClass, DAGOperand Op2Class, string Constraint> : @@ -1083,6 +1136,30 @@ class VPseudoBinaryCarryIn<VReg RetClass, let VLMul = MInfo.value; } +class VPseudoTiedBinaryCarryIn<VReg RetClass, + VReg Op1Class, + DAGOperand Op2Class, + LMULInfo MInfo, + bit CarryIn, + string Constraint> : + Pseudo<(outs RetClass:$rd), + !if(CarryIn, + (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, AVL:$vl, + ixlenimm:$sew), + (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew)), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasMergeOp = 1; + let HasVecPolicyOp = 0; + let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); + let VLMul = MInfo.value; +} + class VPseudoTernaryNoMask<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class, @@ -1323,6 +1400,9 @@ multiclass VPseudoUSLoad { def "E" # eew # "_V_" # LInfo : VPseudoUSLoadNoMask<vreg, eew, false>, VLESched<eew>; + def "E" # eew # "_V_" # LInfo # "_TU": + VPseudoUSLoadNoMaskTU<vreg, eew, false>, + VLESched<eew>; def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSLoadMask<vreg, eew, false>, VLESched<eew>; @@ -1340,6 +1420,9 @@ multiclass VPseudoFFLoad { def "E" # eew # "FF_V_" # LInfo : VPseudoUSLoadNoMask<vreg, eew, true>, VLFSched<eew>; + def "E" # eew # "FF_V_" # LInfo # "_TU": + VPseudoUSLoadNoMaskTU<vreg, eew, true>, + VLFSched<eew>; def "E" # eew # "FF_V_" # LInfo # "_MASK" : VPseudoUSLoadMask<vreg, eew, true>, VLFSched<eew>; @@ -1364,6 +1447,8 @@ multiclass VPseudoSLoad { let VLMul = lmul.value in { def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>, VLSSched<eew>; + def "E" # eew # "_V_" # LInfo # "_TU": VPseudoSLoadNoMaskTU<vreg, eew>, + VLSSched<eew>; def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>, VLSSched<eew>; } @@ -1390,6 +1475,9 @@ multiclass VPseudoILoad<bit Ordered> { def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo : VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, VLXSched<eew, Order>; + def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_TU": + VPseudoILoadNoMaskTU<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, + VLXSched<eew, Order>; def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, VLXSched<eew, Order>; @@ -1504,7 +1592,7 @@ multiclass VPseudoVSFS_M { } multiclass VPseudoVID_V { - foreach m = MxList.m in { + foreach m = MxList in { let VLMul = m.value in { def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>, Sched<[WriteVMIdxV, ReadVMask]>; @@ -1524,7 +1612,7 @@ multiclass VPseudoNullaryPseudoM <string BaseInst> { multiclass VPseudoVIOT_M { defvar constraint = "@earlyclobber $rd"; - foreach m = MxList.m in { + foreach m = MxList in { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>, Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; @@ -1535,7 +1623,7 @@ multiclass VPseudoVIOT_M { } multiclass VPseudoVCPR_V { - foreach m = MxList.m in { + foreach m = MxList in { let VLMul = m.value in def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>, Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>; @@ -1596,12 +1684,18 @@ multiclass VPseudoTiedBinary<VReg RetClass, } multiclass VPseudoBinaryV_VV<string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in + defm _VV : 
VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>; +} + +// Similar to VPseudoBinaryV_VV, but uses MxListF. +multiclass VPseudoBinaryFV_VV<string Constraint = ""> { + foreach m = MxListF in defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>; } multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> { - foreach m = MxList.m in { + foreach m = MxList in { foreach sew = EEWList in { defvar octuple_lmul = m.octuple; // emul = lmul * eew / sew @@ -1617,38 +1711,38 @@ multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> { } multiclass VPseudoBinaryV_VX<string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>; } multiclass VPseudoVSLD1_VX<string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>, Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>; } multiclass VPseudoBinaryV_VF<string Constraint = ""> { - foreach m = MxList.m in - foreach f = FPList.fpinfo in + foreach f = FPList in + foreach m = f.MxList in defm "_V" # f.FX : VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>; } multiclass VPseudoVSLD1_VF<string Constraint = ""> { - foreach m = MxList.m in - foreach f = FPList.fpinfo in + foreach f = FPList in + foreach m = f.MxList in defm "_V" # f.FX : VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>, Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>; } multiclass VPseudoBinaryV_VI<Operand ImmType = simm5, string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in defm _VI : VPseudoBinary<m.vrclass, m.vrclass, ImmType, m, Constraint>; } multiclass VPseudoVALU_MM { - foreach m = MxList.m in + foreach m = MxList in let VLMul = m.value in { def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">, Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; @@ -1662,28 +1756,28 @@ multiclass VPseudoVALU_MM { // * The destination EEW is greater than the source EEW, the source EMUL is // at least 1, and the overlap is in the highest-numbered part of the // destination register group is legal. Otherwise, it is illegal. 
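// [Illustrative sketch, not part of this diff] The overlap rule quoted above
// is what the "@earlyclobber $rd" constraint on the widening pseudos below
// encodes: the register allocator may not assign the wide destination on top
// of a narrow source register group. A minimal hypothetical pseudo carrying
// it (the name here is invented for illustration):
let Constraints = "@earlyclobber $rd" in
def PseudoVWBinarySketch : Pseudo<(outs VRM2:$rd),
                                  (ins VR:$rs2, VR:$rs1, AVL:$vl,
                                       ixlenimm:$sew), []>;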
-multiclass VPseudoBinaryW_VV { - foreach m = MxListW.m in +multiclass VPseudoBinaryW_VV<list<LMULInfo> mxlist = MxListW> { + foreach m = mxlist in defm _VV : VPseudoBinary<m.wvrclass, m.vrclass, m.vrclass, m, "@earlyclobber $rd">; } multiclass VPseudoBinaryW_VX { - foreach m = MxListW.m in + foreach m = MxListW in defm "_VX" : VPseudoBinary<m.wvrclass, m.vrclass, GPR, m, "@earlyclobber $rd">; } multiclass VPseudoBinaryW_VF { - foreach m = MxListW.m in - foreach f = FPListW.fpinfo in + foreach f = FPListW in + foreach m = f.MxList in defm "_V" # f.FX : VPseudoBinary<m.wvrclass, m.vrclass, f.fprclass, m, "@earlyclobber $rd">; } -multiclass VPseudoBinaryW_WV { - foreach m = MxListW.m in { +multiclass VPseudoBinaryW_WV<list<LMULInfo> mxlist = MxListW> { + foreach m = mxlist in { defm _WV : VPseudoBinary<m.wvrclass, m.wvrclass, m.vrclass, m, "@earlyclobber $rd">; defm _WV : VPseudoTiedBinary<m.wvrclass, m.vrclass, m, @@ -1692,13 +1786,13 @@ multiclass VPseudoBinaryW_WV { } multiclass VPseudoBinaryW_WX { - foreach m = MxListW.m in + foreach m = MxListW in defm "_WX" : VPseudoBinary<m.wvrclass, m.wvrclass, GPR, m>; } multiclass VPseudoBinaryW_WF { - foreach m = MxListW.m in - foreach f = FPListW.fpinfo in + foreach f = FPListW in + foreach m = f.MxList in defm "_W" # f.FX : VPseudoBinary<m.wvrclass, m.wvrclass, f.fprclass, m>; } @@ -1709,19 +1803,19 @@ multiclass VPseudoBinaryW_WF { // "The destination EEW is smaller than the source EEW and the overlap is in the // lowest-numbered part of the source register group." multiclass VPseudoBinaryV_WV { - foreach m = MxListW.m in + foreach m = MxListW in defm _WV : VPseudoBinary<m.vrclass, m.wvrclass, m.vrclass, m, !if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>; } multiclass VPseudoBinaryV_WX { - foreach m = MxListW.m in + foreach m = MxListW in defm _WX : VPseudoBinary<m.vrclass, m.wvrclass, GPR, m, !if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>; } multiclass VPseudoBinaryV_WI { - foreach m = MxListW.m in + foreach m = MxListW in defm _WI : VPseudoBinary<m.vrclass, m.wvrclass, uimm5, m, !if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>; } @@ -1731,7 +1825,7 @@ multiclass VPseudoBinaryV_WI { // For vadc and vsbc, CarryIn == 1 and CarryOut == 0 multiclass VPseudoBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1, string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX : VPseudoBinaryCarryIn<!if(CarryOut, VR, !if(!and(CarryIn, !not(CarryOut)), @@ -1739,9 +1833,19 @@ multiclass VPseudoBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, m.vrclass, m, CarryIn, Constraint>; } +multiclass VPseudoTiedBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1, + string Constraint = ""> { + foreach m = MxList in + def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX # "_TU" : + VPseudoTiedBinaryCarryIn<!if(CarryOut, VR, + !if(!and(CarryIn, !not(CarryOut)), + GetVRegNoV0<m.vrclass>.R, m.vrclass)), + m.vrclass, m.vrclass, m, CarryIn, Constraint>; +} + multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1, string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX : VPseudoBinaryCarryIn<!if(CarryOut, VR, !if(!and(CarryIn, !not(CarryOut)), @@ -1749,18 +1853,34 @@ multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, GPR, m, CarryIn, Constraint>; } +multiclass VPseudoTiedBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1, + string Constraint = ""> { + foreach m = MxList in + def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX # "_TU": + 
VPseudoTiedBinaryCarryIn<!if(CarryOut, VR, + !if(!and(CarryIn, !not(CarryOut)), + GetVRegNoV0<m.vrclass>.R, m.vrclass)), + m.vrclass, GPR, m, CarryIn, Constraint>; +} + multiclass VPseudoVMRG_FM { - foreach m = MxList.m in - foreach f = FPList.fpinfo in + foreach f = FPList in + foreach m = f.MxList in { def "_V" # f.FX # "M_" # m.MX : VPseudoBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">, Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>; + // Tied version to allow codegen control over the tail elements + def "_V" # f.FX # "M_" # m.MX # "_TU": + VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, + m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">, + Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>; + } } multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX : VPseudoBinaryCarryIn<!if(CarryOut, VR, !if(!and(CarryIn, !not(CarryOut)), @@ -1768,8 +1888,18 @@ multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, simm5, m, CarryIn, Constraint>; } +multiclass VPseudoTiedBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, + string Constraint = ""> { + foreach m = MxList in + def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX # "_TU": + VPseudoTiedBinaryCarryIn<!if(CarryOut, VR, + !if(!and(CarryIn, !not(CarryOut)), + GetVRegNoV0<m.vrclass>.R, m.vrclass)), + m.vrclass, simm5, m, CarryIn, Constraint>; +} + multiclass VPseudoUnaryVMV_V_X_I { - foreach m = MxList.m in { + foreach m = MxList in { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>, Sched<[WriteVIMovV, ReadVIMovV]>; @@ -1782,8 +1912,8 @@ multiclass VPseudoUnaryVMV_V_X_I { } multiclass VPseudoVMV_F { - foreach m = MxList.m in { - foreach f = FPList.fpinfo in { + foreach f = FPList in { + foreach m = f.MxList in { let VLMul = m.value in { def "_" # f.FX # "_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>, @@ -1794,7 +1924,7 @@ multiclass VPseudoVMV_F { } multiclass VPseudoVCLS_V { - foreach m = MxList.m in { + foreach m = MxListF in { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; @@ -1805,7 +1935,7 @@ multiclass VPseudoVCLS_V { } multiclass VPseudoVSQR_V { - foreach m = MxList.m in { + foreach m = MxListF in { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; @@ -1816,7 +1946,7 @@ multiclass VPseudoVSQR_V { } multiclass VPseudoVRCP_V { - foreach m = MxList.m in { + foreach m = MxListF in { let VLMul = m.value in { def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; @@ -1828,7 +1958,7 @@ multiclass VPseudoVRCP_V { multiclass PseudoVEXT_VF2 { defvar constraints = "@earlyclobber $rd"; - foreach m = MxListVF2.m in + foreach m = MxListVF2 in { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>, @@ -1842,7 +1972,7 @@ multiclass PseudoVEXT_VF2 { multiclass PseudoVEXT_VF4 { defvar constraints = "@earlyclobber $rd"; - foreach m = MxListVF4.m in + foreach m = MxListVF4 in { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>, @@ -1856,7 +1986,7 @@ multiclass PseudoVEXT_VF4 { multiclass PseudoVEXT_VF8 { defvar constraints = "@earlyclobber $rd"; - foreach m = MxListVF8.m in + foreach m = 
MxListVF8 in { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>, @@ -1879,29 +2009,29 @@ multiclass PseudoVEXT_VF8 { // lowest-numbered part of the source register group". // With LMUL<=1 the source and dest occupy a single register so any overlap // is in the lowest-numbered part. -multiclass VPseudoBinaryM_VV { - foreach m = MxList.m in +multiclass VPseudoBinaryM_VV<list<LMULInfo> mxlist = MxList> { + foreach m = mxlist in defm _VV : VPseudoBinaryM<VR, m.vrclass, m.vrclass, m, !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>; } multiclass VPseudoBinaryM_VX { - foreach m = MxList.m in + foreach m = MxList in defm "_VX" : VPseudoBinaryM<VR, m.vrclass, GPR, m, !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>; } multiclass VPseudoBinaryM_VF { - foreach m = MxList.m in - foreach f = FPList.fpinfo in + foreach f = FPList in + foreach m = f.MxList in defm "_V" # f.FX : VPseudoBinaryM<VR, m.vrclass, f.fprclass, m, !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>; } multiclass VPseudoBinaryM_VI { - foreach m = MxList.m in + foreach m = MxList in defm _VI : VPseudoBinaryM<VR, m.vrclass, simm5, m, !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>; } @@ -1995,14 +2125,14 @@ multiclass VPseudoVDIV_VV_VX { } multiclass VPseudoVFMUL_VV_VF { - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryFV_VV, Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>; defm "" : VPseudoBinaryV_VF, Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>; } multiclass VPseudoVFDIV_VV_VF { - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryFV_VV, Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>; defm "" : VPseudoBinaryV_VF, Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; @@ -2021,21 +2151,21 @@ multiclass VPseudoVALU_VV_VX { } multiclass VPseudoVSGNJ_VV_VF { - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryFV_VV, Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>; defm "" : VPseudoBinaryV_VF, Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>; } multiclass VPseudoVMAX_VV_VF { - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryFV_VV, Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; defm "" : VPseudoBinaryV_VF, Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; } multiclass VPseudoVALU_VV_VF { - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryFV_VV, Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>; defm "" : VPseudoBinaryV_VF, Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; @@ -2068,17 +2198,12 @@ multiclass VPseudoVWMUL_VV_VX { } multiclass VPseudoVWMUL_VV_VF { - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV<MxListFW>, Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>; defm "" : VPseudoBinaryW_VF, Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>; } -multiclass VPseudoBinaryW_VV_VF { - defm "" : VPseudoBinaryW_VV; - defm "" : VPseudoBinaryW_VF; -} - multiclass VPseudoVWALU_WV_WX { defm "" : VPseudoBinaryW_WV, Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>; @@ -2087,14 +2212,14 @@ multiclass VPseudoVWALU_WV_WX { } multiclass VPseudoVFWALU_VV_VF { - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV<MxListFW>, Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; defm "" : VPseudoBinaryW_VF, Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; } multiclass VPseudoVFWALU_WV_WF { - defm "" : VPseudoBinaryW_WV, + defm "" : VPseudoBinaryW_WV<MxListFW>, Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; defm "" : 
VPseudoBinaryW_WF, Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; @@ -2107,6 +2232,13 @@ multiclass VPseudoVMRG_VM_XM_IM { Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>; defm "" : VPseudoBinaryV_IM, Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>; + // Tied versions to allow codegen control over the tail elements + defm "" : VPseudoTiedBinaryV_VM, + Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_XM, + Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_IM, + Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>; } multiclass VPseudoVCALU_VM_XM_IM { @@ -2199,56 +2331,57 @@ multiclass VPseudoTernaryWithPolicy<VReg RetClass, } } -multiclass VPseudoTernaryV_VV_AAXA<string Constraint = ""> { - foreach m = MxList.m in { +multiclass VPseudoTernaryV_VV_AAXA<string Constraint = "", + list<LMULInfo> mxlist = MxList> { + foreach m = mxlist in { defm _VV : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, m.vrclass, m, Constraint, /*Commutable*/1>; } } multiclass VPseudoTernaryV_VX<string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in defm _VX : VPseudoTernary<m.vrclass, m.vrclass, GPR, m, Constraint>; } multiclass VPseudoTernaryV_VX_AAXA<string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in defm "_VX" : VPseudoTernaryWithPolicy<m.vrclass, GPR, m.vrclass, m, Constraint, /*Commutable*/1>; } multiclass VPseudoTernaryV_VF_AAXA<string Constraint = ""> { - foreach m = MxList.m in - foreach f = FPList.fpinfo in + foreach f = FPList in + foreach m = f.MxList in defm "_V" # f.FX : VPseudoTernaryWithPolicy<m.vrclass, f.fprclass, m.vrclass, m, Constraint, /*Commutable*/1>; } -multiclass VPseudoTernaryW_VV { +multiclass VPseudoTernaryW_VV<list<LMULInfo> mxlist = MxListW> { defvar constraint = "@earlyclobber $rd"; - foreach m = MxListW.m in + foreach m = mxlist in defm _VV : VPseudoTernaryWithPolicy<m.wvrclass, m.vrclass, m.vrclass, m, constraint>; } multiclass VPseudoTernaryW_VX { defvar constraint = "@earlyclobber $rd"; - foreach m = MxListW.m in + foreach m = MxListW in defm "_VX" : VPseudoTernaryWithPolicy<m.wvrclass, GPR, m.vrclass, m, constraint>; } multiclass VPseudoTernaryW_VF { defvar constraint = "@earlyclobber $rd"; - foreach m = MxListW.m in - foreach f = FPListW.fpinfo in + foreach f = FPListW in + foreach m = f.MxList in defm "_V" # f.FX : VPseudoTernaryWithPolicy<m.wvrclass, f.fprclass, m.vrclass, m, constraint>; } multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> { - foreach m = MxList.m in + foreach m = MxList in defm _VI : VPseudoTernary<m.vrclass, m.vrclass, ImmType, m, Constraint>; } @@ -2260,7 +2393,7 @@ multiclass VPseudoVMAC_VV_VX_AAXA<string Constraint = ""> { } multiclass VPseudoVMAC_VV_VF_AAXA<string Constraint = ""> { - defm "" : VPseudoTernaryV_VV_AAXA<Constraint>, + defm "" : VPseudoTernaryV_VV_AAXA<Constraint, MxListF>, Sched<[WriteVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>; defm "" : VPseudoTernaryV_VF_AAXA<Constraint>, Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>; @@ -2286,7 +2419,7 @@ multiclass VPseudoVWMAC_VX { } multiclass VPseudoVWMAC_VV_VF { - defm "" : VPseudoTernaryW_VV, + defm "" : VPseudoTernaryW_VV<MxListFW>, Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>; defm "" : VPseudoTernaryW_VF, Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>; @@ -2309,7 +2442,7 @@ 
multiclass VPseudoVCMPM_VV_VX { } multiclass VPseudoVCMPM_VV_VF { - defm "" : VPseudoBinaryM_VV, + defm "" : VPseudoBinaryM_VV<MxListF>, Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; defm "" : VPseudoBinaryM_VF, Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; @@ -2328,35 +2461,35 @@ multiclass VPseudoVCMPM_VX_VI { } multiclass VPseudoVRED_VS { - foreach m = MxList.m in { + foreach m = MxList in { defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV, ReadVIRedV, ReadVMask]>; } } multiclass VPseudoVWRED_VS { - foreach m = MxList.m in { + foreach m = MxList in { defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVMask]>; } } multiclass VPseudoVFRED_VS { - foreach m = MxList.m in { + foreach m = MxListF in { defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV, ReadVFRedV, ReadVMask]>; } } multiclass VPseudoVFREDO_VS { - foreach m = MxList.m in { + foreach m = MxListF in { defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVMask]>; } } multiclass VPseudoVFWRED_VS { - foreach m = MxList.m in { + foreach m = MxListF in { defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVMask]>; } @@ -2374,61 +2507,61 @@ multiclass VPseudoConversion<VReg RetClass, } multiclass VPseudoVCVTI_V { - foreach m = MxList.m in + foreach m = MxListF in defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>, Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>; } multiclass VPseudoVCVTF_V { - foreach m = MxList.m in + foreach m = MxListF in defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>, Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>; } multiclass VPseudoConversionW_V { defvar constraint = "@earlyclobber $rd"; - foreach m = MxListW.m in + foreach m = MxListW in defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>; } multiclass VPseudoVWCVTI_V { defvar constraint = "@earlyclobber $rd"; - foreach m = MxList.m[0-5] in + foreach m = MxListFW in defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>; } multiclass VPseudoVWCVTF_V { defvar constraint = "@earlyclobber $rd"; - foreach m = MxList.m[0-5] in + foreach m = MxListW in defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>; } multiclass VPseudoVWCVTD_V { defvar constraint = "@earlyclobber $rd"; - foreach m = MxList.m[0-5] in + foreach m = MxListFW in defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>; } multiclass VPseudoVNCVTI_W { defvar constraint = "@earlyclobber $rd"; - foreach m = MxList.m[0-5] in + foreach m = MxListW in defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>; } multiclass VPseudoVNCVTF_W { defvar constraint = "@earlyclobber $rd"; - foreach m = MxList.m[0-5] in + foreach m = MxListFW in defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>; } multiclass VPseudoVNCVTD_W { defvar constraint = "@earlyclobber $rd"; - foreach m = MxListW.m in + foreach m = MxListFW in defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, 
constraint>, Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>; } @@ -3702,6 +3835,28 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> { } } +multiclass VPatCompare_VI<string intrinsic, string inst, + ImmLeaf ImmType = simm5_plus1> { + foreach vti = AllIntegerVectors in { + defvar Intr = !cast<Intrinsic>(intrinsic); + defvar Pseudo = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX); + def : Pat<(vti.Mask (Intr (vti.Vector vti.RegClass:$rs1), + (vti.Scalar ImmType:$rs2), + VLOpFrag)), + (Pseudo vti.RegClass:$rs1, (DecImm ImmType:$rs2), + GPR:$vl, vti.Log2SEW)>; + defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask"); + defvar PseudoMask = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX#"_MASK"); + def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$merge), + (vti.Vector vti.RegClass:$rs1), + (vti.Scalar ImmType:$rs2), + (vti.Mask V0), + VLOpFrag)), + (PseudoMask VR:$merge, vti.RegClass:$rs1, (DecImm ImmType:$rs2), + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + } +} + //===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -3741,7 +3896,7 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in { def PseudoVRELOAD_M8 : VPseudo<VL8RE8_V, V_M8, (outs VRM8:$rs1), (ins GPR:$rs2)>; } -foreach lmul = MxList.m in { +foreach lmul = MxList in { foreach nf = NFSet<lmul>.L in { defvar vreg = SegRegClass<lmul, nf>.RC; let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in { @@ -3765,9 +3920,9 @@ let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in { // the when we aren't using one of the special X0 encodings. Otherwise it could // be accidentally be made X0 by MachineIR optimizations. To satisfy the // verifier, we also need a GPRX0 instruction for the special encodings. -def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPRNoX0:$rs1, VTypeIOp:$vtypei), []>; -def PseudoVSETVLIX0 : Pseudo<(outs GPR:$rd), (ins GPRX0:$rs1, VTypeIOp:$vtypei), []>; -def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei), []>; +def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPRNoX0:$rs1, VTypeIOp11:$vtypei), []>; +def PseudoVSETVLIX0 : Pseudo<(outs GPR:$rd), (ins GPRX0:$rs1, VTypeIOp11:$vtypei), []>; +def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp10:$vtypei), []>; } //===----------------------------------------------------------------------===// @@ -4304,7 +4459,7 @@ defm PseudoVID : VPseudoVID_V; let Predicates = [HasVInstructions] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - foreach m = MxList.m in { + foreach m = MxList in { let VLMul = m.value in { let HasSEWOp = 1, BaseInstr = VMV_X_S in def PseudoVMV_X_S # "_" # m.MX: @@ -4330,8 +4485,8 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let Predicates = [HasVInstructionsAnyF] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - foreach m = MxList.m in { - foreach f = FPList.fpinfo in { + foreach f = FPList in { + foreach m = f.MxList in { let VLMul = m.value in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in def "PseudoVFMV_" # f.FX # "_S_" # m.MX : @@ -4452,6 +4607,30 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsrl", "PseudoVSRL", AllIntegerVectors, defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors, uimm5>; +foreach vti = AllIntegerVectors in { + // Emit shift by 1 as an add since it might be faster. 
+ def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$rs1), + (XLenVT 1), VLOpFrag)), + (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX) vti.RegClass:$rs1, + vti.RegClass:$rs1, + GPR:$vl, + vti.Log2SEW)>; + def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge), + (vti.Vector vti.RegClass:$rs1), + (XLenVT 1), + (vti.Mask V0), + VLOpFrag, + (XLenVT timm:$policy))), + (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX#"_MASK") + vti.RegClass:$merge, + vti.RegClass:$rs1, + vti.RegClass:$rs1, + (vti.Mask V0), + GPR:$vl, + vti.Log2SEW, + (XLenVT timm:$policy))>; +} + //===----------------------------------------------------------------------===// // 12.7. Vector Narrowing Integer Right Shift Instructions //===----------------------------------------------------------------------===// @@ -4481,129 +4660,11 @@ defm : VPatBinarySwappedM_VV<"int_riscv_vmsge", "PseudoVMSLE", AllIntegerVectors // Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16. This // avoids the user needing to know that there is no vmslt(u).vi instruction. // Similar for vmsge(u).vx intrinsics using vmslt(u).vi. -foreach vti = AllIntegerVectors in { - def : Pat<(vti.Mask (int_riscv_vmslt (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX) vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - GPR:$vl, - vti.Log2SEW)>; - def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask VR:$merge), - (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - (vti.Mask V0), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX#"_MASK") - VR:$merge, - vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - (vti.Mask V0), - GPR:$vl, - vti.Log2SEW)>; +defm : VPatCompare_VI<"int_riscv_vmslt", "PseudoVMSLE">; +defm : VPatCompare_VI<"int_riscv_vmsltu", "PseudoVMSLEU", simm5_plus1_nonzero>; - def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX) vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - GPR:$vl, - vti.Log2SEW)>; - def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge), - (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - (vti.Mask V0), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX#"_MASK") - VR:$merge, - vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - (vti.Mask V0), - GPR:$vl, - vti.Log2SEW)>; - - // Special cases to avoid matching vmsltu.vi 0 (always false) to - // vmsleu.vi -1 (always true). Instead match to vmsne.vv. 
- def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1), - (vti.Scalar 0), VLOpFrag)), - (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX) vti.RegClass:$rs1, - vti.RegClass:$rs1, - GPR:$vl, - vti.Log2SEW)>; - def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge), - (vti.Vector vti.RegClass:$rs1), - (vti.Scalar 0), - (vti.Mask V0), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX#"_MASK") - VR:$merge, - vti.RegClass:$rs1, - vti.RegClass:$rs1, - (vti.Mask V0), - GPR:$vl, - vti.Log2SEW)>; - - def : Pat<(vti.Mask (int_riscv_vmsge (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSGT_VI_"#vti.LMul.MX) vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - GPR:$vl, - vti.Log2SEW)>; - def : Pat<(vti.Mask (int_riscv_vmsge_mask (vti.Mask VR:$merge), - (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - (vti.Mask V0), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSGT_VI_"#vti.LMul.MX#"_MASK") - VR:$merge, - vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - (vti.Mask V0), - GPR:$vl, - vti.Log2SEW)>; - - def : Pat<(vti.Mask (int_riscv_vmsgeu (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSGTU_VI_"#vti.LMul.MX) vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - GPR:$vl, - vti.Log2SEW)>; - def : Pat<(vti.Mask (int_riscv_vmsgeu_mask (vti.Mask VR:$merge), - (vti.Vector vti.RegClass:$rs1), - (vti.Scalar simm5_plus1:$rs2), - (vti.Mask V0), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSGTU_VI_"#vti.LMul.MX#"_MASK") - VR:$merge, - vti.RegClass:$rs1, - (DecImm simm5_plus1:$rs2), - (vti.Mask V0), - GPR:$vl, - vti.Log2SEW)>; - - // Special cases to avoid matching vmsgeu.vi 0 (always true) to - // vmsgtu.vi -1 (always false). Instead match to vmsne.vv. - def : Pat<(vti.Mask (int_riscv_vmsgeu (vti.Vector vti.RegClass:$rs1), - (vti.Scalar 0), VLOpFrag)), - (!cast<Instruction>("PseudoVMSEQ_VV_"#vti.LMul.MX) vti.RegClass:$rs1, - vti.RegClass:$rs1, - GPR:$vl, - vti.Log2SEW)>; - def : Pat<(vti.Mask (int_riscv_vmsgeu_mask (vti.Mask VR:$merge), - (vti.Vector vti.RegClass:$rs1), - (vti.Scalar 0), - (vti.Mask V0), - VLOpFrag)), - (!cast<Instruction>("PseudoVMSEQ_VV_"#vti.LMul.MX#"_MASK") - VR:$merge, - vti.RegClass:$rs1, - vti.RegClass:$rs1, - (vti.Mask V0), - GPR:$vl, - vti.Log2SEW)>; -} +defm : VPatCompare_VI<"int_riscv_vmsge", "PseudoVMSGT">; +defm : VPatCompare_VI<"int_riscv_vmsgeu", "PseudoVMSGTU", simm5_plus1_nonzero>; //===----------------------------------------------------------------------===// // 12.9. 
Vector Integer Min/Max Instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 711ad4335ece..e452a84a9a6f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -363,6 +363,91 @@ multiclass VPatNConvertFP2ISDNode_V<SDNode vop, string instruction_name> { } } +multiclass VPatWidenBinarySDNode_VV_VX_WV_WX<SDNode op, PatFrags extop, string instruction_name> { + foreach vti = AllWidenableIntVectors in { + def : Pat<(op (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs2))), + (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), + (!cast<Instruction>(instruction_name#"_VV_"#vti.Vti.LMul.MX) + vti.Vti.RegClass:$rs2, vti.Vti.RegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + def : Pat<(op (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs2))), + (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))), + (!cast<Instruction>(instruction_name#"_VX_"#vti.Vti.LMul.MX) + vti.Vti.RegClass:$rs2, GPR:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), + (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), + (!cast<Instruction>(instruction_name#"_WV_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), + (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))), + (!cast<Instruction>(instruction_name#"_WX_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rs2, GPR:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + } +} + +multiclass VPatWidenMulAddSDNode_VV<PatFrags extop1, PatFrags extop2, string instruction_name> { + foreach vti = AllWidenableIntVectors in { + def : Pat< + (add (vti.Wti.Vector vti.Wti.RegClass:$rd), + (mul_oneuse (vti.Wti.Vector (extop1 (vti.Vti.Vector vti.Vti.RegClass:$rs1))), + (vti.Wti.Vector (extop2 (vti.Vti.Vector vti.Vti.RegClass:$rs2))))), + (!cast<Instruction>(instruction_name#"_VV_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rd, vti.Vti.RegClass:$rs1, vti.Vti.RegClass:$rs2, + vti.Vti.AVL, vti.Vti.Log2SEW, TAIL_AGNOSTIC + )>; + } +} +multiclass VPatWidenMulAddSDNode_VX<PatFrags extop1, PatFrags extop2, string instruction_name> { + foreach vti = AllWidenableIntVectors in { + def : Pat< + (add (vti.Wti.Vector vti.Wti.RegClass:$rd), + (mul_oneuse (vti.Wti.Vector (extop1 (vti.Vti.Vector (SplatPat GPR:$rs1)))), + (vti.Wti.Vector (extop2 (vti.Vti.Vector vti.Vti.RegClass:$rs2))))), + (!cast<Instruction>(instruction_name#"_VX_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rd, GPR:$rs1, vti.Vti.RegClass:$rs2, + vti.Vti.AVL, vti.Vti.Log2SEW, TAIL_AGNOSTIC + )>; + } +} + +multiclass VPatWidenBinaryFPSDNode_VV_VF<SDNode op, string instruction_name> { + foreach vti = AllWidenableFloatVectors in { + def : Pat<(op (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs2))), + (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), + (!cast<Instruction>(instruction_name#"_VV_"#vti.Vti.LMul.MX) + vti.Vti.RegClass:$rs2, vti.Vti.RegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + def : Pat<(op (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs2))), + (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector (SplatPat vti.Vti.ScalarRegClass:$rs1))))), + (!cast<Instruction>(instruction_name#"_V"#vti.Vti.ScalarSuffix#"_"#vti.Vti.LMul.MX) + vti.Vti.RegClass:$rs2, vti.Vti.ScalarRegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + } +} + +multiclass 
VPatWidenBinaryFPSDNode_WV_WF<SDNode op, string instruction_name> { + foreach vti = AllWidenableFloatVectors in { + def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), + (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), + (!cast<Instruction>(instruction_name#"_WV_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2), + (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector (SplatPat vti.Vti.ScalarRegClass:$rs1))))), + (!cast<Instruction>(instruction_name#"_W"#vti.Vti.ScalarSuffix#"_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rs2, vti.Vti.ScalarRegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + } +} + +multiclass VPatWidenBinaryFPSDNode_VV_VF_WV_WF<SDNode op, string instruction_name> { + defm : VPatWidenBinaryFPSDNode_VV_VF<op, instruction_name>; + defm : VPatWidenBinaryFPSDNode_WV_WF<op, instruction_name>; +} + //===----------------------------------------------------------------------===// // Patterns. //===----------------------------------------------------------------------===// @@ -399,6 +484,15 @@ foreach vti = AllIntegerVectors in { vti.RegClass:$rs1, simm5:$rs2, vti.AVL, vti.Log2SEW)>; } +// 12.2. Vector Widening Integer Add and Subtract +defm : VPatWidenBinarySDNode_VV_VX_WV_WX<add, sext_oneuse, "PseudoVWADD">; +defm : VPatWidenBinarySDNode_VV_VX_WV_WX<add, zext_oneuse, "PseudoVWADDU">; +defm : VPatWidenBinarySDNode_VV_VX_WV_WX<add, anyext_oneuse, "PseudoVWADDU">; + +defm : VPatWidenBinarySDNode_VV_VX_WV_WX<sub, sext_oneuse, "PseudoVWSUB">; +defm : VPatWidenBinarySDNode_VV_VX_WV_WX<sub, zext_oneuse, "PseudoVWSUBU">; +defm : VPatWidenBinarySDNode_VV_VX_WV_WX<sub, anyext_oneuse, "PseudoVWSUBU">; + // 12.3. Vector Integer Extension defm : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF2", AllFractionableVF2IntVectors>; @@ -513,6 +607,15 @@ foreach vti = AllIntegerVectors in { vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>; } +// 12.14 Vector Widening Integer Multiply-Add Instructions +defm : VPatWidenMulAddSDNode_VV<sext_oneuse, sext_oneuse, "PseudoVWMACC">; +defm : VPatWidenMulAddSDNode_VX<sext_oneuse, sext_oneuse, "PseudoVWMACC">; +defm : VPatWidenMulAddSDNode_VV<zext_oneuse, zext_oneuse, "PseudoVWMACCU">; +defm : VPatWidenMulAddSDNode_VX<zext_oneuse, zext_oneuse, "PseudoVWMACCU">; +defm : VPatWidenMulAddSDNode_VV<sext_oneuse, zext_oneuse, "PseudoVWMACCSU">; +defm : VPatWidenMulAddSDNode_VX<sext_oneuse, zext_oneuse, "PseudoVWMACCSU">; +defm : VPatWidenMulAddSDNode_VX<zext_oneuse, sext_oneuse, "PseudoVWMACCUS">; + // 12.15. Vector Integer Merge Instructions foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (vselect (vti.Mask V0), vti.RegClass:$rs1, @@ -582,11 +685,18 @@ defm : VPatBinaryFPSDNode_VV_VF<fadd, "PseudoVFADD">; defm : VPatBinaryFPSDNode_VV_VF<fsub, "PseudoVFSUB">; defm : VPatBinaryFPSDNode_R_VF<fsub, "PseudoVFRSUB">; +// 14.3. Vector Widening Floating-Point Add/Subtract Instructions +defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF<fadd, "PseudoVFWADD">; +defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF<fsub, "PseudoVFWSUB">; + // 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions defm : VPatBinaryFPSDNode_VV_VF<fmul, "PseudoVFMUL">; defm : VPatBinaryFPSDNode_VV_VF<fdiv, "PseudoVFDIV">; defm : VPatBinaryFPSDNode_R_VF<fdiv, "PseudoVFRDIV">; +// 14.5. Vector Widening Floating-Point Multiply Instructions +defm : VPatWidenBinaryFPSDNode_VV_VF<fmul, "PseudoVFWMUL">; + // 14.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions. 
foreach fvti = AllFloatVectors in { // NOTE: We choose VFMADD because it has the most commuting freedom. So it diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 73b97e1c3675..964f0fa54512 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -177,14 +177,13 @@ def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL", SDTCisSameNumEltsAs<0, 3>, SDTCisVT<4, XLenVT>]>>; -def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", - SDTypeProfile<1, 4, [SDTCisVec<0>, - SDTCisVec<1>, - SDTCisSameNumEltsAs<0, 1>, - SDTCVecEltisVT<1, i1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<2, 3>, - SDTCisVT<4, XLenVT>]>>; +def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, + SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisVT<4, XLenVT> +]>; + +def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>; +def riscv_vp_merge_vl : SDNode<"RISCVISD::VP_MERGE_VL", SDT_RISCVSelect_VL>; def SDT_RISCVMaskBinOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -216,19 +215,20 @@ def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>; def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL", SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisVec<1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 2>, SDTCVecEltisVT<2, i1>, SDTCisVT<3, XLenVT>]>>; -def SDT_RISCVVWMUL_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, - SDTCisSameNumEltsAs<0, 1>, - SDTCisSameAs<1, 2>, - SDTCisSameNumEltsAs<1, 3>, - SDTCVecEltisVT<3, i1>, - SDTCisVT<4, XLenVT>]>; -def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>; -def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>; +def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisSameAs<1, 2>, + SDTCisSameNumEltsAs<1, 3>, + SDTCVecEltisVT<3, i1>, + SDTCisVT<4, XLenVT>]>; +def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def SDTRVVVecReduce : SDTypeProfile<1, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>, @@ -363,37 +363,47 @@ multiclass VPatBinaryWVL_VV_VX<SDNode vop, string instruction_name> { } } -class VPatBinaryVL_VF<SDNode vop, - string instruction_name, - ValueType result_type, - ValueType vop_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - VReg vop_reg_class, - RegisterClass scalar_reg_class> : - Pat<(result_type (vop (vop_type vop_reg_class:$rs1), - (vop_type (SplatFPOp scalar_reg_class:$rs2)), - (mask_type true_mask), - VLOpFrag)), +multiclass VPatBinaryVL_VF<SDNode vop, + string instruction_name, + ValueType result_type, + ValueType vop_type, + ValueType mask_type, + int sew, + LMULInfo vlmul, + VReg vop_reg_class, + RegisterClass scalar_reg_class> { + def : Pat<(result_type (vop (vop_type vop_reg_class:$rs1), + (vop_type (SplatFPOp scalar_reg_class:$rs2)), + (mask_type true_mask), + VLOpFrag)), (!cast<Instruction>(instruction_name#"_"#vlmul.MX) vop_reg_class:$rs1, scalar_reg_class:$rs2, GPR:$vl, sew)>; + def : Pat<(result_type (vop (vop_type vop_reg_class:$rs1), + (vop_type (SplatFPOp scalar_reg_class:$rs2)), + (mask_type V0), + VLOpFrag)), + 
(!cast<Instruction>(instruction_name#"_"#vlmul.MX#"_MASK") + (result_type (IMPLICIT_DEF)), + vop_reg_class:$rs1, + scalar_reg_class:$rs2, + (mask_type V0), GPR:$vl, sew, TAIL_AGNOSTIC)>; +} multiclass VPatBinaryFPVL_VV_VF<SDNode vop, string instruction_name> { foreach vti = AllFloatVectors in { defm : VPatBinaryVL_VV<vop, instruction_name, vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW, vti.LMul, vti.RegClass>; - def : VPatBinaryVL_VF<vop, instruction_name#"_V"#vti.ScalarSuffix, - vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW, - vti.LMul, vti.RegClass, vti.ScalarRegClass>; + defm : VPatBinaryVL_VF<vop, instruction_name#"_V"#vti.ScalarSuffix, + vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW, + vti.LMul, vti.RegClass, vti.ScalarRegClass>; } } multiclass VPatBinaryFPVL_R_VF<SDNode vop, string instruction_name> { - foreach fvti = AllFloatVectors in + foreach fvti = AllFloatVectors in { def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), fvti.RegClass:$rs1, (fvti.Mask true_mask), @@ -401,6 +411,15 @@ multiclass VPatBinaryFPVL_R_VF<SDNode vop, string instruction_name> { (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), + fvti.RegClass:$rs1, + (fvti.Mask V0), + VLOpFrag)), + (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, + (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; + } } multiclass VPatIntegerSetCCVL_VV<VTypeInfo vti, string instruction_name, @@ -602,6 +621,47 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> { } } +multiclass VPatBinarySDNodeExt_V_WV<SDNode op, PatFrags extop, string instruction_name> { + foreach vti = AllWidenableIntVectors in { + def : Pat< + (vti.Vti.Vector + (riscv_trunc_vector_vl + (op (vti.Wti.Vector vti.Wti.RegClass:$rs2), + (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))), + (riscv_vmset_vl VLMax), + VLMax)), + (!cast<Instruction>(instruction_name#"_WV_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + } +} + +multiclass VPatBinarySDNodeExt_V_WX<SDNode op, PatFrags extop, string instruction_name> { + foreach vti = AllWidenableIntVectors in { + def : Pat< + (vti.Vti.Vector + (riscv_trunc_vector_vl + (op (vti.Wti.Vector vti.Wti.RegClass:$rs2), + (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))), + (riscv_vmset_vl VLMax), + VLMax)), + (!cast<Instruction>(instruction_name#"_WX_"#vti.Vti.LMul.MX) + vti.Wti.RegClass:$rs2, GPR:$rs1, + vti.Vti.AVL, vti.Vti.Log2SEW)>; + } +} + + +multiclass VPatBinarySDNode_V_WV<SDNode op, string instruction_name> { + defm : VPatBinarySDNodeExt_V_WV<op, sext_oneuse, instruction_name>; + defm : VPatBinarySDNodeExt_V_WV<op, zext_oneuse, instruction_name>; +} + +multiclass VPatBinarySDNode_V_WX<SDNode op, string instruction_name> { + defm : VPatBinarySDNodeExt_V_WX<op, sext_oneuse, instruction_name>; + defm : VPatBinarySDNodeExt_V_WX<op, zext_oneuse, instruction_name>; +} + //===----------------------------------------------------------------------===// // Patterns. //===----------------------------------------------------------------------===// @@ -661,6 +721,9 @@ foreach vti = AllIntegerVectors in { (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } +// 12.2. 
Vector Widening Integer Add/Subtract +defm : VPatBinaryWVL_VV_VX<riscv_vwaddu_vl, "PseudoVWADDU">; + // 12.3. Vector Integer Extension defm : VPatExtendSDNode_V_VL<riscv_zext_vl, "PseudoVZEXT", "VF2", AllFractionableVF2IntVectors>; @@ -696,14 +759,19 @@ foreach vti = AllIntegerVectors in { } // 12.7. Vector Narrowing Integer Right Shift Instructions +defm : VPatBinarySDNode_V_WV<srl, "PseudoVNSRL">; +defm : VPatBinarySDNode_V_WX<srl, "PseudoVNSRL">; +defm : VPatBinarySDNode_V_WV<sra, "PseudoVNSRA">; +defm : VPatBinarySDNode_V_WX<sra, "PseudoVNSRA">; + foreach vtiTowti = AllWidenableIntVectors in { defvar vti = vtiTowti.Vti; defvar wti = vtiTowti.Wti; def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)), - (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX) - wti.RegClass:$rs1, 0, GPR:$vl, vti.Log2SEW)>; + (!cast<Instruction>("PseudoVNSRL_WX_"#vti.LMul.MX) + wti.RegClass:$rs1, X0, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (riscv_trunc_vector_vl @@ -760,6 +828,8 @@ foreach vti = AllIntegerVectors in { defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSNE", SETNE, SETNE>; defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLE", SETLE, SETGE>; defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLEU", SETULE, SETUGE>; + defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGT", SETGT, SETLT>; + defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGTU", SETUGT, SETULT>; defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLE", SETLT, SplatPat_simm5_plus1>; @@ -905,6 +975,30 @@ foreach vti = AllIntegerVectors in { VLOpFrag)), (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX) vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + + def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0), + vti.RegClass:$rs1, + vti.RegClass:$rs2, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX#"_TU") + vti.RegClass:$rs2, vti.RegClass:$rs2, vti.RegClass:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + + def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0), + (SplatPat XLenVT:$rs1), + vti.RegClass:$rs2, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX#"_TU") + vti.RegClass:$rs2, vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + + def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0), + (SplatPat_simm5 simm5:$rs1), + vti.RegClass:$rs2, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX#"_TU") + vti.RegClass:$rs2, vti.RegClass:$rs2, simm5:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } // 12.16. 
Vector Integer Move Instructions @@ -1152,6 +1246,31 @@ foreach fvti = AllFloatVectors in { (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0), + fvti.RegClass:$rs1, + fvti.RegClass:$rs2, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX#"_TU") + fvti.RegClass:$rs2, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0), + (SplatFPOp fvti.ScalarRegClass:$rs1), + fvti.RegClass:$rs2, + VLOpFrag)), + (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX#"_TU") + fvti.RegClass:$rs2, fvti.RegClass:$rs2, + (fvti.Scalar fvti.ScalarRegClass:$rs1), + (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0), + (SplatFPOp (fvti.Scalar fpimm0)), + fvti.RegClass:$rs2, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX#"_TU") + fvti.RegClass:$rs2, fvti.RegClass:$rs2, 0, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; + // 14.16. Vector Floating-Point Move Instruction // If we're splatting fpimm0, use vmv.v.x vd, x0. def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl @@ -1368,6 +1487,11 @@ let Predicates = [HasVInstructionsAnyF] in { // 17.2. Floating-Point Scalar Move Instructions foreach vti = AllFloatVectors in { def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge), + (vti.Scalar (fpimm0)), + VLOpFrag)), + (!cast<Instruction>("PseudoVMV_S_X_"#vti.LMul.MX) + vti.RegClass:$merge, X0, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge), vti.ScalarRegClass:$rs1, VLOpFrag)), (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_"#vti.LMul.MX) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 7eb8ae7d4193..db3f5851879a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -12,14 +12,22 @@ // Zbb - 1.0 // Zbc - 1.0 // Zbs - 1.0 -// Zbe - 0.93 -// Zbf - 0.93 -// Zbm - 0.93 -// Zbp - 0.93 -// Zbr - 0.93 -// Zbt - 0.93 -// This version is still experimental as the Bitmanip extensions haven't been -// ratified yet. +// Zbe - 0.93 *experimental +// Zbf - 0.93 *experimental +// Zbm - 0.93 *experimental +// Zbp - 0.93 *experimental +// Zbr - 0.93 *experimental +// Zbt - 0.93 *experimental +// +// The experimental extensions appeared in an earlier draft of the Bitmanip +// extensions. They are not ratified and subject to change. 
+// +// This file also describes RISC-V instructions from the Zbk* extensions in +// Cryptography Extensions Volume I: Scalar & Entropy Source Instructions, +// versions: +// Zbkb - 1.0 +// Zbkc - 1.0 +// Zbkx - 1.0 // //===----------------------------------------------------------------------===// @@ -43,6 +51,8 @@ def riscv_shfl : SDNode<"RISCVISD::SHFL", SDTIntBinOp>; def riscv_shflw : SDNode<"RISCVISD::SHFLW", SDT_RISCVIntBinOpW>; def riscv_unshfl : SDNode<"RISCVISD::UNSHFL", SDTIntBinOp>; def riscv_unshflw: SDNode<"RISCVISD::UNSHFLW",SDT_RISCVIntBinOpW>; +def riscv_bfp : SDNode<"RISCVISD::BFP", SDTIntBinOp>; +def riscv_bfpw : SDNode<"RISCVISD::BFPW", SDT_RISCVIntBinOpW>; def riscv_bcompress : SDNode<"RISCVISD::BCOMPRESS", SDTIntBinOp>; def riscv_bcompressw : SDNode<"RISCVISD::BCOMPRESSW", SDT_RISCVIntBinOpW>; def riscv_bdecompress : SDNode<"RISCVISD::BDECOMPRESS", SDTIntBinOp>; @@ -309,14 +319,14 @@ class RVBTernaryImm5<bits<2> funct2, bits<3> funct3, RISCVOpcode opcode, // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZbbOrZbp] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def ANDN : ALU_rr<0b0100000, 0b111, "andn">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; def ORN : ALU_rr<0b0100000, 0b110, "orn">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -} // Predicates = [HasStdExtZbbOrZbp] +} // Predicates = [HasStdExtZbbOrZbpOrZbkb] let Predicates = [HasStdExtZba] in { def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">, @@ -327,18 +337,22 @@ def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>; } // Predicates = [HasStdExtZba] -let Predicates = [HasStdExtZbbOrZbp] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>; def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>; -} // Predicates = [HasStdExtZbbOrZbp] +} // Predicates = [HasStdExtZbbOrZbpOrZbkb] let Predicates = [HasStdExtZbs] in { -def BCLR : ALU_rr<0b0100100, 0b001, "bclr">, Sched<[]>; -def BSET : ALU_rr<0b0010100, 0b001, "bset">, Sched<[]>; -def BINV : ALU_rr<0b0110100, 0b001, "binv">, Sched<[]>; -def BEXT : ALU_rr<0b0100100, 0b101, "bext">, Sched<[]>; +def BCLR : ALU_rr<0b0100100, 0b001, "bclr">, + Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; +def BSET : ALU_rr<0b0010100, 0b001, "bset">, + Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; +def BINV : ALU_rr<0b0110100, 0b001, "binv">, + Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; +def BEXT : ALU_rr<0b0100100, 0b101, "bext">, + Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; } // Predicates = [HasStdExtZbs] let Predicates = [HasStdExtZbp] in { @@ -346,21 +360,28 @@ def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>; def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>; } // Predicates = [HasStdExtZbp] +let Predicates = [HasStdExtZbpOrZbkx] in { +def XPERMN : ALU_rr<0b0010100, 0b010, "xperm4">, Sched<[]>; +def XPERMB : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>; +} // Predicates = [HasStdExtZbpOrZbkx] + let Predicates = [HasStdExtZbp] in { -def XPERMN : ALU_rr<0b0010100, 0b010, "xperm.n">, Sched<[]>; -def XPERMB : ALU_rr<0b0010100, 0b100, "xperm.b">, Sched<[]>; def XPERMH : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>; } // Predicates = [HasStdExtZbp] -let Predicates = [HasStdExtZbbOrZbp] in +let 
Predicates = [HasStdExtZbbOrZbpOrZbkb] in def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[WriteRotateImm, ReadRotateImm]>; let Predicates = [HasStdExtZbs] in { -def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">, Sched<[]>; -def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">, Sched<[]>; -def BINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "binvi">, Sched<[]>; -def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">, Sched<[]>; +def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">, + Sched<[WriteSingleBitImm, ReadSingleBitImm]>; +def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">, + Sched<[WriteSingleBitImm, ReadSingleBitImm]>; +def BINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "binvi">, + Sched<[WriteSingleBitImm, ReadSingleBitImm]>; +def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">, + Sched<[WriteSingleBitImm, ReadSingleBitImm]>; } // Predicates = [HasStdExtZbs] let Predicates = [HasStdExtZbp] in { @@ -428,11 +449,17 @@ def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">, Sched<[]>; let Predicates = [HasStdExtZbc] in { -def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>; -def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, Sched<[]>; -def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, Sched<[]>; +def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, + Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; } // Predicates = [HasStdExtZbc] +let Predicates = [HasStdExtZbcOrZbkc] in { +def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, + Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; +def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, + Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; +} // Predicates = [HasStdExtZbcOrZbkc] + let Predicates = [HasStdExtZbb] in { def MIN : ALU_rr<0b0000101, 0b100, "min">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; @@ -456,11 +483,13 @@ def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, Sched<[]>; def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, Sched<[]>; } // Predicates = [HasStdExtZbe] -let Predicates = [HasStdExtZbp] in { +let Predicates = [HasStdExtZbpOrZbkb] in { def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>; -def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>; def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>; -} // Predicates = [HasStdExtZbp] +} // Predicates = [HasStdExtZbpOrZbkb] + +let Predicates = [HasStdExtZbp] in +def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>; let Predicates = [HasStdExtZbm, IsRV64] in { def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>; @@ -468,7 +497,8 @@ def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>; } // Predicates = [HasStdExtZbm, IsRV64] let Predicates = [HasStdExtZbf] in -def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>; +def BFP : ALU_rr<0b0100100, 0b111, "bfp">, + Sched<[WriteBFP, ReadBFP, ReadBFP]>; let Predicates = [HasStdExtZbp] in { def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>; @@ -488,7 +518,7 @@ def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>; def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, @@ -504,7 +534,7 @@ let Predicates = [HasStdExtZbp, IsRV64] in { def XPERMW : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>; } // Predicates = [HasStdExtZbp, 
IsRV64] -let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[WriteRotateImm32, ReadRotateImm32]>; @@ -543,13 +573,15 @@ def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>; def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>; } // Predicates = [HasStdExtZbe, IsRV64] -let Predicates = [HasStdExtZbp, IsRV64] in { +let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>; + +let Predicates = [HasStdExtZbp, IsRV64] in def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>; -} // Predicates = [HasStdExtZbp, IsRV64] let Predicates = [HasStdExtZbf, IsRV64] in -def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[]>; +def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, + Sched<[WriteBFP32, ReadBFP32, ReadBFP32]>; let Predicates = [HasStdExtZbbOrZbp, IsRV32] in { let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -576,30 +608,30 @@ def ZEXTH_RV64 : RVInstR<0b0000100, 0b100, OPC_OP_32, (outs GPR:$rd), // causes diagnostics to suggest that Zbp rather than Zbb is required for rev8 // or gorci. Since Zbb is closer to being finalized than Zbp this will be // misleading to users. -let Predicates = [HasStdExtZbbOrZbp, IsRV32] in { -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def REV8_RV32 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1), - "rev8", "$rd, $rs1">, Sched<[WriteREV8, ReadREV8]> { - let imm12 = { 0b01101, 0b0011000 }; -} -} // Predicates = [HasStdExtZbbOrZbp, IsRV32] +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV32] in { +def REV8_RV32 : RVBUnary<0b0110100, 0b11000, 0b101, OPC_OP_IMM, "rev8">, + Sched<[WriteREV8, ReadREV8]>; +} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV32] -let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def REV8_RV64 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1), - "rev8", "$rd, $rs1">, Sched<[WriteREV8, ReadREV8]> { - let imm12 = { 0b01101, 0b0111000 }; -} -} // Predicates = [HasStdExtZbbOrZbp, IsRV64] +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { +def REV8_RV64 : RVBUnary<0b0110101, 0b11000, 0b101, OPC_OP_IMM, "rev8">, + Sched<[WriteREV8, ReadREV8]>; +} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] let Predicates = [HasStdExtZbbOrZbp] in { -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def ORCB : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1), - "orc.b", "$rd, $rs1">, Sched<[WriteORCB, ReadORCB]> { - let imm12 = { 0b00101, 0b0000111 }; -} +def ORCB : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">, + Sched<[WriteORCB, ReadORCB]>; } // Predicates = [HasStdExtZbbOrZbp] +let Predicates = [HasStdExtZbpOrZbkb] in +def BREV8 : RVBUnary<0b0110100, 0b00111, 0b101, OPC_OP_IMM, "brev8">; + +let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in { +def ZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b001, OPC_OP_IMM, "zip">; +def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">; +} // Predicates = [HasStdExtZbpOrZbkb, IsRV32] + + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// @@ -614,11 +646,11 @@ def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>; def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>; def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs,
0b00100)>; def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>; -def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>; def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>; def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>; def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>; def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>; +def : InstAlias<"rev.b $rd, $rs", (BREV8 GPR:$rd, GPR:$rs)>; def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>; def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>; @@ -658,8 +690,7 @@ def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>; def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>; def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>; def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>; -def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>; -def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>; +// zip and unzip are considered instructions rather than aliases. def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>; def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>; @@ -741,6 +772,13 @@ def : InstAlias<"gorcw $rd, $rs1, $shamt", (GORCIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>; } // Predicates = [HasStdExtZbp, IsRV64] +// Zbp is unratified and would likely adopt the already ratified Zbkx names. +// Thus the current Zbp instructions are defined as aliases for Zbkx instructions. +let Predicates = [HasStdExtZbp] in { + def : InstAlias<"xperm.b $rd, $rs1, $rs2", (XPERMB GPR:$rd, GPR:$rs1, GPR:$rs2)>; + def : InstAlias<"xperm.n $rd, $rs1, $rs2", (XPERMN GPR:$rd, GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbp] + let Predicates = [HasStdExtZbs] in { def : InstAlias<"bset $rd, $rs1, $shamt", (BSETI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>; @@ -756,16 +794,16 @@ def : InstAlias<"bext $rd, $rs1, $shamt", // Codegen patterns //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZbbOrZbp] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>; def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>; def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>; -} // Predicates = [HasStdExtZbbOrZbp] +} // Predicates = [HasStdExtZbbOrZbpOrZbkb] -let Predicates = [HasStdExtZbbOrZbp] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def : PatGprGpr<rotl, ROL>; def : PatGprGpr<rotr, ROR>; -} // Predicates = [HasStdExtZbbOrZbp] +} // Predicates = [HasStdExtZbbOrZbpOrZbkb] let Predicates = [HasStdExtZbs] in { def : Pat<(and (not (shiftop<shl> 1, GPR:$rs2)), GPR:$rs1), @@ -816,7 +854,7 @@ def : Pat<(and GPR:$r, BCLRIANDIMask:$i), // There's no encoding for roli in the 'B' extension as it can be // implemented with rori by negating the immediate.
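A quick standalone check of the identity the RORI patterns below rely on (a rotate left by s equals a rotate right by XLen - s); this is an illustrative C++ sketch, not backend code:

#include <cassert>
#include <cstdint>

// rotl(x, s) == rotr(x, XLen - s); ImmSubFromXLen computes that subtraction.
static uint64_t rotl64(uint64_t x, unsigned s) { return (x << s) | (x >> ((64 - s) & 63)); }
static uint64_t rotr64(uint64_t x, unsigned s) { return (x >> s) | (x << ((64 - s) & 63)); }

int main() {
  for (unsigned s = 1; s < 64; ++s)
    assert(rotl64(0x0123456789abcdefULL, s) == rotr64(0x0123456789abcdefULL, 64 - s));
}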
-let Predicates = [HasStdExtZbbOrZbp] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def : PatGprImm<rotr, RORI, uimmlog2xlen>; def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt), (RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>; @@ -834,19 +872,28 @@ def : PatGprGpr<riscv_unshfl, UNSHFL>; def : PatGprGpr<int_riscv_xperm_n, XPERMN>; def : PatGprGpr<int_riscv_xperm_b, XPERMB>; def : PatGprGpr<int_riscv_xperm_h, XPERMH>; -def : PatGprGpr<int_riscv_xperm_w, XPERMW>; def : PatGprImm<riscv_shfl, SHFLI, shfl_uimm>; def : PatGprImm<riscv_unshfl, UNSHFLI, shfl_uimm>; def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>; def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>; + +// We treat brev8 as a separate instruction, so match it directly. +def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>; } // Predicates = [HasStdExtZbp] +let Predicates = [HasStdExtZbp, IsRV64] in +def : PatGprGpr<int_riscv_xperm_w, XPERMW>; + let Predicates = [HasStdExtZbp, IsRV32] in { def : Pat<(i32 (rotr (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>; def : Pat<(i32 (rotl (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>; // We treat rev8 as a separate instruction, so match it directly. def : Pat<(i32 (riscv_grev GPR:$rs1, 24)), (REV8_RV32 GPR:$rs1)>; + +// We treat zip and unzip as separate instructions, so match it directly. +def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>; +def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>; } // Predicates = [HasStdExtZbp, IsRV32] let Predicates = [HasStdExtZbp, IsRV64] in { @@ -882,21 +929,16 @@ def : Pat<(select GPR:$rs2, GPR:$rs1, GPR:$rs3), (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; } // Predicates = [HasStdExtZbt] -// fshl and fshr concatenate their operands in the same order. fsr and fsl -// instruction use different orders. fshl will return its first operand for -// shift of zero, fshr will return its second operand. fsl and fsr both return -// $rs1 so the patterns need to have different operand orders. let Predicates = [HasStdExtZbt] in { def : Pat<(riscv_fsl GPR:$rs1, GPR:$rs3, GPR:$rs2), (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(riscv_fsr GPR:$rs3, GPR:$rs1, GPR:$rs2), +def : Pat<(riscv_fsr GPR:$rs1, GPR:$rs3, GPR:$rs2), (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>; - -def : Pat<(fshr GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt), +def : Pat<(riscv_fsr GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt), (FSRI GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt)>; -// We can use FSRI for fshl by immediate if we subtract the immediate from +// We can use FSRI for FSL by immediate if we subtract the immediate from // XLen and swap the operands. 
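The funnel-shift identity behind that comment can be sanity-checked in isolation. A minimal C++ sketch with illustrative helpers; it deliberately ignores the FSL/FSR rs1/rs3 operand ordering, which the patterns below handle separately:

#include <cassert>
#include <cstdint>

// ISD::FSHL/FSHR semantics over XLEN = 64, valid for shift amounts 1..63.
static uint64_t fshl64(uint64_t a, uint64_t b, unsigned s) {
  return (a << s) | (b >> (64 - s));
}
static uint64_t fshr64(uint64_t a, uint64_t b, unsigned s) {
  return (a << (64 - s)) | (b >> s);
}

int main() {
  uint64_t a = 0xdeadbeefcafef00dULL, b = 0x0123456789abcdefULL;
  for (unsigned s = 1; s < 64; ++s)
    assert(fshl64(a, b, s) == fshr64(a, b, 64 - s)); // fsl by imm == fsri by XLen - imm
}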
-def : Pat<(fshl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt), +def : Pat<(riscv_fsl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt), (FSRI GPR:$rs1, GPR:$rs3, (ImmSubFromXLen uimmlog2xlen:$shamt))>; } // Predicates = [HasStdExtZbt] @@ -918,31 +960,38 @@ def : PatGprGpr<umin, MINU>; def : PatGprGpr<umax, MAXU>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb, IsRV32] in { +let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in { def : Pat<(i32 (bswap GPR:$rs1)), (REV8_RV32 GPR:$rs1)>; -} // Predicates = [HasStdExtZbb, IsRV32] +} // Predicates = [HasStdExtZbbOrZbkb, IsRV32] -let Predicates = [HasStdExtZbb, IsRV64] in { +let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in { def : Pat<(i64 (bswap GPR:$rs1)), (REV8_RV64 GPR:$rs1)>; -} // Predicates = [HasStdExtZbb, IsRV64] +} // Predicates = [HasStdExtZbbOrZbkb, IsRV64] -let Predicates = [HasStdExtZbp, IsRV32] in { +let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in def : Pat<(i32 (or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16)))), (PACK GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbp, IsRV32] in def : Pat<(i32 (or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16)))), (PACKU GPR:$rs1, GPR:$rs2)>; -} -let Predicates = [HasStdExtZbp, IsRV64] in { +let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))), (PACK GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbp, IsRV64] in def : Pat<(i64 (or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32)))), (PACKU GPR:$rs1, GPR:$rs2)>; -} -let Predicates = [HasStdExtZbp] in + +let Predicates = [HasStdExtZbpOrZbkb] in { def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF), (and GPR:$rs1, 0x00FF)), (PACKH GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl (and GPR:$rs2, 0x00FF), (XLenVT 8)), + (and GPR:$rs1, 0x00FF)), + (PACKH GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbpOrZbkb] let Predicates = [HasStdExtZbbOrZbp, IsRV32] in def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV32 GPR:$rs)>; @@ -1045,13 +1094,13 @@ def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), non_imm12:$rs2)) (SH3ADDUW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZba, IsRV64] -let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { def : PatGprGpr<riscv_rolw, ROLW>; def : PatGprGpr<riscv_rorw, RORW>; def : PatGprImm<riscv_rorw, RORIW, uimm5>; def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>; -} // Predicates = [HasStdExtZbbOrZbp, IsRV64] +} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] let Predicates = [HasStdExtZbp, IsRV64] in { def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; @@ -1067,10 +1116,12 @@ def : PatGprImm<riscv_gorcw, GORCIW, uimm5>; let Predicates = [HasStdExtZbt, IsRV64] in { def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2), (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(riscv_fsrw GPR:$rs3, GPR:$rs1, GPR:$rs2), +def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, GPR:$rs2), (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(riscv_fsrw GPR:$rs3, GPR:$rs1, uimm5:$shamt), +def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, uimm5:$shamt), (FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>; +// We can use FSRIW for FSLW by immediate if we subtract the immediate from +// 32 and swap the operands. 
def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt), (FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>; } // Predicates = [HasStdExtZbt, IsRV64] @@ -1081,7 +1132,7 @@ def : PatGpr<riscv_ctzw, CTZW>; def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbp, IsRV64] in { +let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in { def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)), (and GPR:$rs1, 0x000000000000FFFF)), i32)), @@ -1089,16 +1140,21 @@ def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)), def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), (and GPR:$rs1, 0x000000000000FFFF))), (PACKW GPR:$rs1, GPR:$rs2)>; +} + +let Predicates = [HasStdExtZbp, IsRV64] in def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))), (PACKUW GPR:$rs1, GPR:$rs2)>; -} // Predicates = [HasStdExtZbp, IsRV64] -let Predicates = [HasStdExtZbc] in { + +let Predicates = [HasStdExtZbcOrZbkc] in { def : PatGprGpr<int_riscv_clmul, CLMUL>; def : PatGprGpr<int_riscv_clmulh, CLMULH>; +} // Predicates = [HasStdExtZbcOrZbkc] + +let Predicates = [HasStdExtZbc] in def : PatGprGpr<int_riscv_clmulr, CLMULR>; -} // Predicates = [HasStdExtZbc] let Predicates = [HasStdExtZbe] in { def : PatGprGpr<riscv_bcompress, BCOMPRESS>; @@ -1123,3 +1179,23 @@ let Predicates = [HasStdExtZbr, IsRV64] in { def : PatGpr<int_riscv_crc32_d, CRC32D>; def : PatGpr<int_riscv_crc32c_d, CRC32CD>; } // Predicates = [HasStdExtZbr, IsRV64] + +let Predicates = [HasStdExtZbf] in +def : PatGprGpr<riscv_bfp, BFP>; + +let Predicates = [HasStdExtZbf, IsRV64] in +def : PatGprGpr<riscv_bfpw, BFPW>; + +let Predicates = [HasStdExtZbkb] in { +def : PatGpr<int_riscv_brev8, BREV8>; +} // Predicates = [HasStdExtZbkb] + +let Predicates = [HasStdExtZbkb, IsRV32] in { +def : PatGpr<int_riscv_zip, ZIP_RV32>; +def : PatGpr<int_riscv_unzip, UNZIP_RV32>; +} // Predicates = [HasStdExtZbkb, IsRV32] + +let Predicates = [HasStdExtZbkx] in { +def : PatGprGpr<int_riscv_xperm4, XPERMN>; +def : PatGprGpr<int_riscv_xperm8, XPERMB>; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 663e44813899..dfd0c74ee26c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -1,4 +1,4 @@ -//===-- RISCVInstrInfoFH.td - RISC-V 'FH' instructions -----*- tablegen -*-===// +//===-- RISCVInstrInfoZfh.td - RISC-V 'Zfh' instructions ---*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// // // This file describes the RISC-V instructions from the standard 'Zfh' -// half-precision floating-point extension, version 0.1. -// This version is still experimental as the 'Zfh' extension hasn't been -// ratified yet. +// half-precision floating-point extension, version 1.0. 
// //===----------------------------------------------------------------------===// @@ -32,20 +30,12 @@ def riscv_fmv_x_anyexth //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZfhmin] in { -let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in -def FLH : RVInstI<0b001, OPC_LOAD_FP, (outs FPR16:$rd), - (ins GPR:$rs1, simm12:$imm12), - "flh", "$rd, ${imm12}(${rs1})">, - Sched<[WriteFLD16, ReadFMemBase]>; +def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than // reflecting the order these fields are specified in the instruction // encoding. -let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in -def FSH : RVInstS<0b001, OPC_STORE_FP, (outs), - (ins FPR16:$rs2, GPR:$rs1, simm12:$imm12), - "fsh", "$rs2, ${imm12}(${rs1})">, - Sched<[WriteFST16, ReadStoreData, ReadFMemBase]>; +def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>; } // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZfh] in { @@ -190,6 +180,10 @@ def : InstAlias<"fge.h $rd, $rs, $rt", let Predicates = [HasStdExtZfhmin] in { def PseudoFLH : PseudoFloatLoad<"flh", FPR16>; def PseudoFSH : PseudoStore<"fsh", FPR16>; +let usesCustomInserter = 1 in { +def PseudoQuietFLE_H : PseudoQuietFCMP<FPR16>; +def PseudoQuietFLT_H : PseudoQuietFCMP<FPR16>; +} } // Predicates = [HasStdExtZfhmin] //===----------------------------------------------------------------------===// @@ -207,6 +201,7 @@ let Predicates = [HasStdExtZfh] in { /// Float constants def : Pat<(f16 (fpimm0)), (FMV_H_X X0)>; +def : Pat<(f16 (fpimmneg0)), (FSGNJN_H (FMV_H_X X0), (FMV_H_X X0))>; /// Float conversion operations @@ -254,13 +249,34 @@ def : PatFpr16Fpr16<fminnum, FMIN_H>; def : PatFpr16Fpr16<fmaxnum, FMAX_H>; /// Setcc - -def : PatFpr16Fpr16<seteq, FEQ_H>; -def : PatFpr16Fpr16<setoeq, FEQ_H>; -def : PatFpr16Fpr16<setlt, FLT_H>; -def : PatFpr16Fpr16<setolt, FLT_H>; -def : PatFpr16Fpr16<setle, FLE_H>; -def : PatFpr16Fpr16<setole, FLE_H>; +// FIXME: SETEQ/SETLT/SETLE imply nonans, so can we pick better instructions +// for the strict versions of those? + +// Match non-signaling FEQ_H +def : PatSetCC<FPR16, any_fsetcc, SETEQ, FEQ_H>; +def : PatSetCC<FPR16, any_fsetcc, SETOEQ, FEQ_H>; +def : PatSetCC<FPR16, strict_fsetcc, SETLT, PseudoQuietFLT_H>; +def : PatSetCC<FPR16, strict_fsetcc, SETOLT, PseudoQuietFLT_H>; +def : PatSetCC<FPR16, strict_fsetcc, SETLE, PseudoQuietFLE_H>; +def : PatSetCC<FPR16, strict_fsetcc, SETOLE, PseudoQuietFLE_H>; + +// Match signaling FEQ_H +def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs2, SETEQ), + (AND (FLE_H $rs1, $rs2), + (FLE_H $rs2, $rs1))>; +def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs2, SETOEQ), + (AND (FLE_H $rs1, $rs2), + (FLE_H $rs2, $rs1))>; +// If both operands are the same, use a single FLE. +def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs1, SETEQ), + (FLE_H $rs1, $rs1)>; +def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs1, SETOEQ), + (FLE_H $rs1, $rs1)>; + +def : PatSetCC<FPR16, any_fsetccs, SETLT, FLT_H>; +def : PatSetCC<FPR16, any_fsetccs, SETOLT, FLT_H>; +def : PatSetCC<FPR16, any_fsetccs, SETLE, FLE_H>; +def : PatSetCC<FPR16, any_fsetccs, SETOLE, FLE_H>; def Select_FPR16_Using_CC_GPR : SelectCC_rrirr<FPR16, GPR>; } // Predicates = [HasStdExtZfh] @@ -291,14 +307,14 @@ def : Pat<(i32 (any_fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; def : Pat<(i32 (any_fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>; // Saturating float->[u]int32.
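As background for the conversion patterns below, the frm operand selects the rounding behavior. A small C++ illustration of the libm semantics being matched (assumes the default FE_TONEAREST environment):

#include <cassert>
#include <cmath>

int main() {
  // lround rounds to nearest with ties away from zero, i.e. the RMM mode
  // (frm = 0b100) used by the FCVT patterns.
  assert(std::lround(2.5) == 3 && std::lround(-2.5) == -3);
  // lrint uses the current rounding mode, matching frm = 0b111 (DYN);
  // under the default FE_TONEAREST, ties go to the even value.
  assert(std::lrint(2.5) == 2);
}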
-def : Pat<(i32 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; -def : Pat<(i32 (riscv_fcvt_xu_rtz FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>; +def : Pat<(i32 (riscv_fcvt_x FPR16:$rs1, timm:$frm)), (FCVT_W_H $rs1, timm:$frm)>; +def : Pat<(i32 (riscv_fcvt_xu FPR16:$rs1, timm:$frm)), (FCVT_WU_H $rs1, timm:$frm)>; // half->int32 with current rounding mode. -def : Pat<(i32 (lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>; +def : Pat<(i32 (any_lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>; // half->int32 rounded to nearest with ties rounded away from zero. -def : Pat<(i32 (lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>; +def : Pat<(i32 (any_lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>; // [u]int->half. Match GCC and default to using dynamic rounding mode. def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>; @@ -309,24 +325,24 @@ let Predicates = [HasStdExtZfh, IsRV64] in { // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>; -def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rv64 FPR16:$rs1, timm:$frm), (FCVT_W_H $rs1, timm:$frm)>; +def : Pat<(riscv_any_fcvt_wu_rv64 FPR16:$rs1, timm:$frm), (FCVT_WU_H $rs1, timm:$frm)>; // half->[u]int64. Round-to-zero must be used. def : Pat<(i64 (any_fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; def : Pat<(i64 (any_fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>; // Saturating float->[u]int64. -def : Pat<(i64 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; -def : Pat<(i64 (riscv_fcvt_xu_rtz FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>; +def : Pat<(i64 (riscv_fcvt_x FPR16:$rs1, timm:$frm)), (FCVT_L_H $rs1, timm:$frm)>; +def : Pat<(i64 (riscv_fcvt_xu FPR16:$rs1, timm:$frm)), (FCVT_LU_H $rs1, timm:$frm)>; // half->int64 with current rounding mode. -def : Pat<(i64 (lrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>; -def : Pat<(i64 (llrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>; +def : Pat<(i64 (any_lrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>; +def : Pat<(i64 (any_llrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>; // half->int64 rounded to nearest with ties rounded away from zero. -def : Pat<(i64 (lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; -def : Pat<(i64 (llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; +def : Pat<(i64 (any_lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; +def : Pat<(i64 (any_llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; // [u]int->fp. Match GCC and default to using dynamic rounding mode. def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td new file mode 100644 index 000000000000..4a41cddedc71 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td @@ -0,0 +1,203 @@ +//===- RISCVInstrInfoZk.td - RISC-V Scalar Crypto instructions - tablegen -*===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the RISC-V instructions from the standard 'Zk', +// Scalar Cryptography Instructions extension, version 1.0. 
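For context on the operand classes defined just below: the 2-bit byte-select operand bs (encoded in Inst{31-30}) tells the 32-bit AES/SM4 instructions which byte of rs2 to process. A minimal C++ sketch with a hypothetical helper:

#include <cassert>
#include <cstdint>

// bs is a uimm2: it selects one of the four bytes of rs2.
static uint8_t selectByte(uint32_t rs2, unsigned bs) {
  assert(bs < 4);
  return (rs2 >> (8 * bs)) & 0xFF;
}

int main() {
  uint32_t rs2 = 0x44332211;
  assert(selectByte(rs2, 0) == 0x11 && selectByte(rs2, 3) == 0x44);
}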
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. +//===----------------------------------------------------------------------===// + +def RnumArg : AsmOperandClass { + let Name = "RnumArg"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidRnumArg"; +} + +def rnum : Operand<i32>, TImmLeaf<i32, [{return (Imm >= 0 && Imm <= 10);}]> { + let ParserMatchClass = RnumArg; + let EncoderMethod = "getImmOpValue"; + let DecoderMethod = "decodeUImmOperand<4>"; + let OperandType = "OPERAND_RVKRNUM"; + let OperandNamespace = "RISCVOp"; +} + +def byteselect : Operand<i8>, TImmLeaf<i8, [{return isUInt<2>(Imm);}]> { + let ParserMatchClass = UImmAsmOperand<2>; + let DecoderMethod = "decodeUImmOperand<2>"; + let OperandType = "OPERAND_UIMM2"; + let OperandNamespace = "RISCVOp"; +} + +//===----------------------------------------------------------------------===// +// Instruction class templates +//===----------------------------------------------------------------------===// +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVKUnary<bits<12> imm12_in, bits<3> funct3, string opcodestr> + : RVInstI<funct3, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1), + opcodestr, "$rd, $rs1">{ + let imm12 = imm12_in; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVKByteSelect<bits<5> funct5, string opcodestr> + : RVInstR<{0b00, funct5}, 0b000, OPC_OP, (outs GPR:$rd), + (ins GPR:$rs1, GPR:$rs2, byteselect:$bs), + opcodestr, "$rd, $rs1, $rs2, $bs">{ + bits<2> bs; + let Inst{31-30} = bs; +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVKUnary_rnum<bits<7> funct7, bits<3> funct3, string opcodestr> + : RVInstI<funct3, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1, rnum:$rnum), + opcodestr, "$rd, $rs1, $rnum">{ + bits<4> rnum; + let Inst{31-25} = funct7; + let Inst{24} = 1; + let Inst{23-20} = rnum; +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasStdExtZknd, IsRV32] in { +def AES32DSI : RVKByteSelect<0b10101, "aes32dsi">; +def AES32DSMI : RVKByteSelect<0b10111, "aes32dsmi">; +} // Predicates = [HasStdExtZknd, IsRV32] + +let Predicates = [HasStdExtZknd, IsRV64] in { +def AES64DS : ALU_rr<0b0011101, 0b000, "aes64ds">; +def AES64DSM : ALU_rr<0b0011111, 0b000, "aes64dsm">; + +def AES64IM : RVKUnary<0b001100000000, 0b001, "aes64im">; +} // Predicates = [HasStdExtZknd, IsRV64] + +let Predicates = [HasStdExtZkndOrZkne, IsRV64] in { +def AES64KS2 : ALU_rr<0b0111111, 0b000, "aes64ks2">; + +def AES64KS1I : RVKUnary_rnum<0b0011000, 0b001, "aes64ks1i">; +} // Predicates = [HasStdExtZkndOrZkne, IsRV64] + +let Predicates = [HasStdExtZkne, IsRV32] in { +def AES32ESI : RVKByteSelect<0b10001, "aes32esi">; +def AES32ESMI : RVKByteSelect<0b10011, "aes32esmi">; +} // Predicates = [HasStdExtZkne, IsRV32] + +let Predicates = [HasStdExtZkne, IsRV64] in { +def AES64ES : ALU_rr<0b0011001, 0b000, "aes64es">; +def AES64ESM : ALU_rr<0b0011011, 0b000, "aes64esm">; +} // Predicates = [HasStdExtZkne, IsRV64] + +let Predicates = [HasStdExtZknh] in { +def SHA256SIG0 : RVKUnary<0b000100000010, 0b001, "sha256sig0">; +def SHA256SIG1 : RVKUnary<0b000100000011, 0b001, "sha256sig1">; +def SHA256SUM0 : RVKUnary<0b000100000000, 0b001, "sha256sum0">; +def SHA256SUM1 : 
RVKUnary<0b000100000001, 0b001, "sha256sum1">; +} // Predicates = [HasStdExtZknh] + +let Predicates = [HasStdExtZknh, IsRV32] in { +def SHA512SIG0H : ALU_rr<0b0101110, 0b000, "sha512sig0h">; +def SHA512SIG0L : ALU_rr<0b0101010, 0b000, "sha512sig0l">; +def SHA512SIG1H : ALU_rr<0b0101111, 0b000, "sha512sig1h">; +def SHA512SIG1L : ALU_rr<0b0101011, 0b000, "sha512sig1l">; +def SHA512SUM0R : ALU_rr<0b0101000, 0b000, "sha512sum0r">; +def SHA512SUM1R : ALU_rr<0b0101001, 0b000, "sha512sum1r">; +} // [HasStdExtZknh, IsRV32] + +let Predicates = [HasStdExtZknh, IsRV64] in { +def SHA512SIG0 : RVKUnary<0b000100000110, 0b001, "sha512sig0">; +def SHA512SIG1 : RVKUnary<0b000100000111, 0b001, "sha512sig1">; +def SHA512SUM0 : RVKUnary<0b000100000100, 0b001, "sha512sum0">; +def SHA512SUM1 : RVKUnary<0b000100000101, 0b001, "sha512sum1">; +} // Predicates = [HasStdExtZknh, IsRV64] + +let Predicates = [HasStdExtZksed] in { +def SM4ED : RVKByteSelect<0b11000, "sm4ed">; +def SM4KS : RVKByteSelect<0b11010, "sm4ks">; +} // Predicates = [HasStdExtZksed] + +let Predicates = [HasStdExtZksh] in { +def SM3P0 : RVKUnary<0b000100001000, 0b001, "sm3p0">; +def SM3P1 : RVKUnary<0b000100001001, 0b001, "sm3p1">; +} // Predicates = [HasStdExtZksh] + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// + +class PatGprGprByteSelect<SDPatternOperator OpNode, RVInst Inst> + : Pat<(OpNode GPR:$rs1, GPR:$rs2, i8:$imm), + (Inst GPR:$rs1, GPR:$rs2, byteselect:$imm)>; + +// Zknd +let Predicates = [HasStdExtZknd, IsRV32] in { +def : PatGprGprByteSelect<int_riscv_aes32dsi, AES32DSI>; +def : PatGprGprByteSelect<int_riscv_aes32dsmi, AES32DSMI>; +} // Predicates = [HasStdExtZknd, IsRV32] + +let Predicates = [HasStdExtZknd, IsRV64] in { +def : PatGprGpr<int_riscv_aes64ds, AES64DS>; +def : PatGprGpr<int_riscv_aes64dsm, AES64DSM>; +def : PatGpr<int_riscv_aes64im, AES64IM>; +} // Predicates = [HasStdExtZknd, IsRV64] + +let Predicates = [HasStdExtZkndOrZkne, IsRV64] in { +def : PatGprGpr<int_riscv_aes64ks2, AES64KS2>; +def : Pat<(int_riscv_aes64ks1i GPR:$rs1, i32:$rnum), + (AES64KS1I GPR:$rs1, rnum:$rnum)>; +} // Predicates = [HasStdExtZkndOrZkne, IsRV64] + +// Zkne +let Predicates = [HasStdExtZkne, IsRV32] in { +def : PatGprGprByteSelect<int_riscv_aes32esi, AES32ESI>; +def : PatGprGprByteSelect<int_riscv_aes32esmi, AES32ESMI>; +} // Predicates = [HasStdExtZkne, IsRV32] + +let Predicates = [HasStdExtZkne, IsRV64] in { +def : PatGprGpr<int_riscv_aes64es, AES64ES>; +def : PatGprGpr<int_riscv_aes64esm, AES64ESM>; +} // Predicates = [HasStdExtZkne, IsRV64] + +// Zknh +let Predicates = [HasStdExtZknh] in { +def : PatGpr<int_riscv_sha256sig0, SHA256SIG0>; +def : PatGpr<int_riscv_sha256sig1, SHA256SIG1>; +def : PatGpr<int_riscv_sha256sum0, SHA256SUM0>; +def : PatGpr<int_riscv_sha256sum1, SHA256SUM1>; +} // Predicates = [HasStdExtZknh] + +let Predicates = [HasStdExtZknh, IsRV32] in { +def : PatGprGpr<int_riscv_sha512sig0l, SHA512SIG0L>; +def : PatGprGpr<int_riscv_sha512sig0h, SHA512SIG0H>; +def : PatGprGpr<int_riscv_sha512sig1l, SHA512SIG1L>; +def : PatGprGpr<int_riscv_sha512sig1h, SHA512SIG1H>; +def : PatGprGpr<int_riscv_sha512sum0r, SHA512SUM0R>; +def : PatGprGpr<int_riscv_sha512sum1r, SHA512SUM1R>; +} // Predicates = [HasStdExtZknh, IsRV32] + +let Predicates = [HasStdExtZknh, IsRV64] in { +def : PatGpr<int_riscv_sha512sig0, SHA512SIG0>; +def : PatGpr<int_riscv_sha512sig1, SHA512SIG1>; +def : PatGpr<int_riscv_sha512sum0, 
SHA512SUM0>; +def : PatGpr<int_riscv_sha512sum1, SHA512SUM1>; +} // Predicates = [HasStdExtZknh, IsRV64] + +// Zksed +let Predicates = [HasStdExtZksed] in { +def : PatGprGprByteSelect<int_riscv_sm4ks, SM4KS>; +def : PatGprGprByteSelect<int_riscv_sm4ed, SM4ED>; +} // Predicates = [HasStdExtZksed] + +// Zksh +let Predicates = [HasStdExtZksh] in { +def : PatGpr<int_riscv_sm3p0, SM3P0>; +def : PatGpr<int_riscv_sm3p1, SM3P1>; +} // Predicates = [HasStdExtZksh] diff --git a/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp index 4d1f47da209d..8dfd71ac0b6b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp @@ -69,8 +69,7 @@ private: RISCVInstructionSelector::RISCVInstructionSelector( const RISCVTargetMachine &TM, const RISCVSubtarget &STI, const RISCVRegisterBankInfo &RBI) - : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), + : STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "RISCVGenGlobalISel.inc" diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp index dd084f53e511..c167c095521a 100644 --- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp +++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp @@ -172,7 +172,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI, default: llvm_unreachable("Unknown operand type"); case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (RISCV::VRM2RegClass.contains(Reg) || RISCV::VRM4RegClass.contains(Reg) || diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 9094dff1dda1..35363bf37c0d 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -347,3 +347,8 @@ void RISCVRegisterInfo::getOffsetOpcodes(const StackOffset &Offset, Ops.push_back(dwarf::DW_OP_minus); } } + +unsigned +RISCVRegisterInfo::getRegisterCostTableIndex(const MachineFunction &MF) const { + return MF.getSubtarget<RISCVSubtarget>().hasStdExtC() ? 1 : 0; +} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 2b2bbdfbdf32..9e0ef7902210 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -66,6 +66,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { void getOffsetOpcodes(const StackOffset &Offset, SmallVectorImpl<uint64_t> &Ops) const override; + + unsigned getRegisterCostTableIndex(const MachineFunction &MF) const override; }; } diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 20903b317180..8c1c03b51c24 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -73,12 +73,11 @@ def sub_vrm1_7 : ComposedSubRegIndex<sub_vrm2_3, sub_vrm1_1>; // are not part of GPRC, the most restrictive register class used by the // compressed instruction set. This will influence the greedy register // allocator to reduce the use of registers that can't be encoded in 16 bit -// instructions. This affects register allocation even when compressed -// instruction isn't targeted, we see no major negative codegen impact. +// instructions. 
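A sketch of how getRegisterCostTableIndex() above combines with the two-column CostPerUse lists below; the struct and helper are hypothetical stand-ins for the TableGen-generated register tables:

#include <array>
#include <cassert>

struct RegInfo { std::array<unsigned, 2> CostPerUse; };

// Column 1 is consulted when the C extension is enabled, mirroring
// getRegisterCostTableIndex() returning hasStdExtC() ? 1 : 0.
static unsigned regCost(const RegInfo &R, bool HasStdExtC) {
  return R.CostPerUse[HasStdExtC ? 1 : 0];
}

int main() {
  RegInfo A0{{0, 0}}; // x10: always encodable in compressed forms
  RegInfo A6{{0, 1}}; // x16: outside GPRC, penalized only when C is enabled
  assert(regCost(A6, true) == 1 && regCost(A6, false) == 0);
  assert(regCost(A0, true) == 0);
}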
let RegAltNameIndices = [ABIRegAltName] in { def X0 : RISCVReg<0, "x0", ["zero"]>, DwarfRegNum<[0]>; - let CostPerUse = [1] in { + let CostPerUse = [0, 1] in { def X1 : RISCVReg<1, "x1", ["ra"]>, DwarfRegNum<[1]>; def X2 : RISCVReg<2, "x2", ["sp"]>, DwarfRegNum<[2]>; def X3 : RISCVReg<3, "x3", ["gp"]>, DwarfRegNum<[3]>; @@ -95,7 +94,7 @@ let RegAltNameIndices = [ABIRegAltName] in { def X13 : RISCVReg<13,"x13", ["a3"]>, DwarfRegNum<[13]>; def X14 : RISCVReg<14,"x14", ["a4"]>, DwarfRegNum<[14]>; def X15 : RISCVReg<15,"x15", ["a5"]>, DwarfRegNum<[15]>; - let CostPerUse = [1] in { + let CostPerUse = [0, 1] in { def X16 : RISCVReg<16,"x16", ["a6"]>, DwarfRegNum<[16]>; def X17 : RISCVReg<17,"x17", ["a7"]>, DwarfRegNum<[17]>; def X18 : RISCVReg<18,"x18", ["s2"]>, DwarfRegNum<[18]>; @@ -138,27 +137,11 @@ def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> { let RegInfos = XLenRI; } -// The order of registers represents the preferred allocation sequence. -// Registers are listed in the order caller-save, callee-save, specials. -def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (add - (sequence "X%u", 10, 17), - (sequence "X%u", 5, 7), - (sequence "X%u", 28, 31), - (sequence "X%u", 8, 9), - (sequence "X%u", 18, 27), - (sequence "X%u", 1, 4) - )> { +def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (sub GPR, X0)> { let RegInfos = XLenRI; } -def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add - (sequence "X%u", 10, 17), - (sequence "X%u", 5, 7), - (sequence "X%u", 28, 31), - (sequence "X%u", 8, 9), - (sequence "X%u", 18, 27), - X1, X3, X4 - )> { +def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (sub GPR, X0, X2)> { let RegInfos = XLenRI; } @@ -166,13 +149,7 @@ def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add // stack on some microarchitectures. Also remove the reserved registers X0, X2, // X3, and X4 as it reduces the number of register classes that get synthesized // by tablegen. -def GPRJALR : RegisterClass<"RISCV", [XLenVT], 32, (add - (sequence "X%u", 10, 17), - (sequence "X%u", 6, 7), - (sequence "X%u", 28, 31), - (sequence "X%u", 8, 9), - (sequence "X%u", 18, 27) - )> { +def GPRJALR : RegisterClass<"RISCV", [XLenVT], 32, (sub GPR, (sequence "X%u", 0, 5))> { let RegInfos = XLenRI; } diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp new file mode 100644 index 000000000000..12ec52925798 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp @@ -0,0 +1,278 @@ +//===-------------- RISCVSExtWRemoval.cpp - MI sext.w Removal -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// This pass removes unneeded sext.w instructions at the MI level. 
+// +//===---------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-sextw-removal" + +STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions"); + +static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal", + cl::desc("Disable removal of sext.w"), + cl::init(false), cl::Hidden); +namespace { + +class RISCVSExtWRemoval : public MachineFunctionPass { +public: + static char ID; + + RISCVSExtWRemoval() : MachineFunctionPass(ID) { + initializeRISCVSExtWRemovalPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "RISCV sext.w Removal"; } +}; + +} // end anonymous namespace + +char RISCVSExtWRemoval::ID = 0; +INITIALIZE_PASS(RISCVSExtWRemoval, DEBUG_TYPE, "RISCV sext.w Removal", false, + false) + +FunctionPass *llvm::createRISCVSExtWRemovalPass() { + return new RISCVSExtWRemoval(); +} + +// This function returns true if the machine instruction always outputs a value +// where bits 63:32 match bit 31. +// TODO: Allocate a bit in TSFlags for the W instructions? +// TODO: Add other W instructions. +static bool isSignExtendingOpW(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case RISCV::LUI: + case RISCV::LW: + case RISCV::ADDW: + case RISCV::ADDIW: + case RISCV::SUBW: + case RISCV::MULW: + case RISCV::SLLW: + case RISCV::SLLIW: + case RISCV::SRAW: + case RISCV::SRAIW: + case RISCV::SRLW: + case RISCV::SRLIW: + case RISCV::DIVW: + case RISCV::DIVUW: + case RISCV::REMW: + case RISCV::REMUW: + case RISCV::ROLW: + case RISCV::RORW: + case RISCV::RORIW: + case RISCV::CLZW: + case RISCV::CTZW: + case RISCV::CPOPW: + case RISCV::FCVT_W_H: + case RISCV::FCVT_WU_H: + case RISCV::FCVT_W_S: + case RISCV::FCVT_WU_S: + case RISCV::FCVT_W_D: + case RISCV::FCVT_WU_D: + // The following aren't W instructions, but are either sign-extended from a + // smaller size or put zeros in bits 63:31. + case RISCV::LBU: + case RISCV::LHU: + case RISCV::LB: + case RISCV::LH: + case RISCV::SLT: + case RISCV::SLTI: + case RISCV::SLTU: + case RISCV::SLTIU: + case RISCV::SEXTB: + case RISCV::SEXTH: + case RISCV::ZEXTH_RV64: + return true; + // Shifting right sufficiently far leaves a 32-bit sign-extended value. + case RISCV::SRAI: + return MI.getOperand(2).getImm() >= 32; + case RISCV::SRLI: + return MI.getOperand(2).getImm() > 32; + // The LI pattern ADDI rd, X0, imm is sign-extended. + case RISCV::ADDI: + return MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0; + // An ANDI with an 11-bit immediate will zero bits 63:11. + case RISCV::ANDI: + return isUInt<11>(MI.getOperand(2).getImm()); + // An ORI with a negative 12-bit immediate (bit 11 set) will set bits 63:11. + case RISCV::ORI: + return !isUInt<11>(MI.getOperand(2).getImm()); + // Copying from X0 produces zero.
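+ // (X0 is hard-wired to zero, so bits 63:31 of the result are zero and the + // value is trivially sign-extended.)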
+ case RISCV::COPY: + return MI.getOperand(1).getReg() == RISCV::X0; + } + + return false; +} + +static bool isSignExtendedW(const MachineInstr &OrigMI, + MachineRegisterInfo &MRI) { + + SmallPtrSet<const MachineInstr *, 4> Visited; + SmallVector<const MachineInstr *, 4> Worklist; + + Worklist.push_back(&OrigMI); + + while (!Worklist.empty()) { + const MachineInstr *MI = Worklist.pop_back_val(); + + // If we already visited this instruction, we don't need to check it again. + if (!Visited.insert(MI).second) + continue; + + // If this is a sign extending operation we don't need to look any further. + if (isSignExtendingOpW(*MI)) + continue; + + // Is this an instruction that propagates sign extension? + switch (MI->getOpcode()) { + default: + // Unknown opcode, give up. + return false; + case RISCV::COPY: { + Register SrcReg = MI->getOperand(1).getReg(); + + // TODO: Handle arguments and returns from calls? + + // If this is a copy from another register, check its source instruction. + if (!SrcReg.isVirtual()) + return false; + const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + if (!SrcMI) + return false; + + // Add SrcMI to the worklist. + Worklist.push_back(SrcMI); + break; + } + case RISCV::REM: + case RISCV::ANDI: + case RISCV::ORI: + case RISCV::XORI: { + // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R. + // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1. + // Logical operations use a sign-extended 12-bit immediate. We just need + // to check if the other operand is sign-extended. + Register SrcReg = MI->getOperand(1).getReg(); + if (!SrcReg.isVirtual()) + return false; + const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + if (!SrcMI) + return false; + + // Add SrcMI to the worklist. + Worklist.push_back(SrcMI); + break; + } + case RISCV::REMU: + case RISCV::AND: + case RISCV::OR: + case RISCV::XOR: + case RISCV::ANDN: + case RISCV::ORN: + case RISCV::XNOR: + case RISCV::MAX: + case RISCV::MAXU: + case RISCV::MIN: + case RISCV::MINU: + case RISCV::PHI: { + // If all incoming values are sign-extended, the output of AND, OR, XOR, + // MIN, MAX, or PHI is also sign-extended. + + // The input registers for PHI are operand 1, 3, ... + // The input registers for others are operand 1 and 2. + unsigned E = 3, D = 1; + if (MI->getOpcode() == RISCV::PHI) { + E = MI->getNumOperands(); + D = 2; + } + + for (unsigned I = 1; I != E; I += D) { + if (!MI->getOperand(I).isReg()) + return false; + + Register SrcReg = MI->getOperand(I).getReg(); + if (!SrcReg.isVirtual()) + return false; + const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg); + if (!SrcMI) + return false; + + // Add SrcMI to the worklist. + Worklist.push_back(SrcMI); + } + + break; + } + } + } + + // If we get here, then every node we visited produces a sign-extended value + // or propagates sign-extended values, so the result must be sign-extended. + return true; +} + +bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction()) || DisableSExtWRemoval) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>(); + + if (!ST.is64Bit()) + return false; + + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + for (auto I = MBB.begin(), IE = MBB.end(); I != IE;) { + MachineInstr *MI = &*I++; + + // We're looking for the sext.w pattern ADDIW rd, rs1, 0.
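+ // (sext.w rd, rs1 is the assembler alias for addiw rd, rs1, 0: adding + // zero performs only the sign extension from bit 31.)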
+ if (MI->getOpcode() != RISCV::ADDIW || !MI->getOperand(2).isImm() || + MI->getOperand(2).getImm() != 0 || !MI->getOperand(1).isReg()) + continue; + + // Input should be a virtual register. + Register SrcReg = MI->getOperand(1).getReg(); + if (!SrcReg.isVirtual()) + continue; + + const MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg); + if (!isSignExtendedW(SrcMI, MRI)) + continue; + + Register DstReg = MI->getOperand(0).getReg(); + if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg))) + continue; + + LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); + MRI.replaceRegWith(DstReg, SrcReg); + MRI.clearKillFlags(SrcReg); + MI->eraseFromParent(); + ++NumRemovedSExtW; + MadeChange = true; + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index d5a0932c8778..78cf34c8c582 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -17,7 +17,10 @@ def RocketModel : SchedMachineModel { let LoadLatency = 3; let MispredictPenalty = 3; let CompleteModel = false; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; + let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx, + HasStdExtZknd, HasStdExtZkne, HasStdExtZknh, + HasStdExtZksed, HasStdExtZksh, HasStdExtZkr, + HasVInstructions, HasVInstructionsI64]; } //===----------------------------------------------------------------------===// @@ -237,5 +240,8 @@ def : ReadAdvance<ReadFClass64, 0>; defm : UnsupportedSchedV; defm : UnsupportedSchedZba; defm : UnsupportedSchedZbb; +defm : UnsupportedSchedZbc; +defm : UnsupportedSchedZbs; +defm : UnsupportedSchedZbf; defm : UnsupportedSchedZfh; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 7f9d0aabc4ed..9f5e5ff1223c 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -15,7 +15,10 @@ def SiFive7Model : SchedMachineModel { let LoadLatency = 3; let MispredictPenalty = 3; let CompleteModel = 0; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; + let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx, + HasStdExtZknd, HasStdExtZkne, HasStdExtZknh, + HasStdExtZksed, HasStdExtZksh, HasStdExtZkr, + HasVInstructions]; } // The SiFive7 microarchitecture has two pipelines: A and B. @@ -224,5 +227,8 @@ def : ReadAdvance<ReadFClass64, 0>; defm : UnsupportedSchedV; defm : UnsupportedSchedZba; defm : UnsupportedSchedZbb; +defm : UnsupportedSchedZbc; +defm : UnsupportedSchedZbs; +defm : UnsupportedSchedZbf; defm : UnsupportedSchedZfh; } diff --git a/llvm/lib/Target/RISCV/RISCVScheduleB.td b/llvm/lib/Target/RISCV/RISCVScheduleB.td index b668b0acd719..193760e1e15b 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleB.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleB.td @@ -26,6 +26,17 @@ def WriteCPOP32 : SchedWrite; def WriteREV8 : SchedWrite; def WriteORCB : SchedWrite; +// Zbc extension +def WriteCLMUL : SchedWrite; // CLMUL/CLMULR/CLMULH + +// Zbs extension +def WriteSingleBit : SchedWrite; // BCLR/BSET/BINV/BEXT +def WriteSingleBitImm: SchedWrite; // BCLRI/BSETI/BINVI/BEXTI + +// Zbf extension +def WriteBFP : SchedWrite; // BFP +def WriteBFP32 : SchedWrite; // BFPW + /// Define scheduler resources associated with use operands. 
// Zba extension @@ -46,6 +57,17 @@ def ReadCPOP32 : SchedRead; def ReadREV8 : SchedRead; def ReadORCB : SchedRead; +// Zbc extension +def ReadCLMUL : SchedRead; // CLMUL/CLMULR/CLMULH + +// Zbs extension +def ReadSingleBit : SchedRead; // BCLR/BSET/BINV/BEXT +def ReadSingleBitImm: SchedRead; // BCLRI/BSETI/BINVI/BEXTI + +// Zbf extension +def ReadBFP : SchedRead; // BFP +def ReadBFP32 : SchedRead; // BFPW + /// Define default scheduler resources for B. multiclass UnsupportedSchedZba { @@ -87,3 +109,31 @@ def : ReadAdvance<ReadREV8, 0>; def : ReadAdvance<ReadORCB, 0>; } } + +multiclass UnsupportedSchedZbc { +let Unsupported = true in { +def : WriteRes<WriteCLMUL, []>; + +def : ReadAdvance<ReadCLMUL, 0>; +} +} + +multiclass UnsupportedSchedZbs { +let Unsupported = true in { +def : WriteRes<WriteSingleBit, []>; +def : WriteRes<WriteSingleBitImm, []>; + +def : ReadAdvance<ReadSingleBit, 0>; +def : ReadAdvance<ReadSingleBitImm, 0>; +} +} + +multiclass UnsupportedSchedZbf { +let Unsupported = true in { +def : WriteRes<WriteBFP, []>; +def : WriteRes<WriteBFP32, []>; + +def : ReadAdvance<ReadBFP, 0>; +def : ReadAdvance<ReadBFP32, 0>; +} +} diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 1063134b8a6c..976e4ccb1422 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -18,6 +18,7 @@ #include "RISCVRegisterBankInfo.h" #include "RISCVTargetMachine.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -50,6 +51,16 @@ static cl::opt<unsigned> RVVVectorELENMax( cl::desc("The maximum ELEN value to use for fixed length vectors."), cl::init(64), cl::Hidden); +static cl::opt<bool> RISCVDisableUsingConstantPoolForLargeInts( + "riscv-disable-using-constant-pool-for-large-ints", + cl::desc("Disable using constant pool for large integers."), + cl::init(false), cl::Hidden); + +static cl::opt<unsigned> RISCVMaxBuildIntsCost( + "riscv-max-build-ints-cost", + cl::desc("The maximum cost used for building integers."), cl::init(0), + cl::Hidden); + void RISCVSubtarget::anchor() {} RISCVSubtarget & @@ -110,37 +121,69 @@ const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const { return RegBankInfo.get(); } +bool RISCVSubtarget::useConstantPoolForLargeInts() const { + return !RISCVDisableUsingConstantPoolForLargeInts; +} + +unsigned RISCVSubtarget::getMaxBuildIntsCost() const { + // Loading an integer from the constant pool needs two instructions (hence + // the minimum cost of 2): an address calculation instruction and a load + // instruction. Usually, address calculation and instructions used for + // building integers (addi, slli, etc.) can be done in one cycle, so here we + // set the default cost to (LoadLatency + 1) if no threshold is provided. + return RISCVMaxBuildIntsCost == 0 + ? getSchedModel().LoadLatency + 1 + : std::max<unsigned>(2, RISCVMaxBuildIntsCost); +} + unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const { assert(hasVInstructions() && "Tried to get vector length without Zve or V extension support!"); if (RVVVectorBitsMax == 0) return 0; - assert(RVVVectorBitsMax >= 128 && RVVVectorBitsMax <= 65536 && - isPowerOf2_32(RVVVectorBitsMax) && - "V extension requires vector length to be in the range of 128 to " - "65536 and a power of 2!"); + + // ZvlLen specifies the minimum required vlen. The upper bound provided by // riscv-v-vector-bits-max should be no less than it.
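+ // For example, with Zvl128b a riscv-v-vector-bits-max of 64 is rejected + // below.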
+ if (RVVVectorBitsMax < ZvlLen) + report_fatal_error("riscv-v-vector-bits-max specified is lower " + "than the Zvl*b limitation"); + + // FIXME: Change to >= 32 when VLEN = 32 is supported + assert( + RVVVectorBitsMax >= 64 && RVVVectorBitsMax <= 65536 && + isPowerOf2_32(RVVVectorBitsMax) && + "V or Zve* extension requires vector length to be in the range of 64 to " + "65536 and a power of 2!"); assert(RVVVectorBitsMax >= RVVVectorBitsMin && "Minimum V extension vector length should not be larger than its " "maximum!"); unsigned Max = std::max(RVVVectorBitsMin, RVVVectorBitsMax); - return PowerOf2Floor((Max < 128 || Max > 65536) ? 0 : Max); + return PowerOf2Floor((Max < 64 || Max > 65536) ? 0 : Max); } unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const { + // ZvlLen specifies the minimum required vlen. The lower bound provided by + // riscv-v-vector-bits-min should be no less than it. + if (RVVVectorBitsMin != 0 && RVVVectorBitsMin < ZvlLen) + report_fatal_error("riscv-v-vector-bits-min specified is lower " + "than the Zvl*b limitation"); + assert(hasVInstructions() && "Tried to get vector length without Zve or V extension support!"); - assert((RVVVectorBitsMin == 0 || - (RVVVectorBitsMin >= 128 && RVVVectorBitsMax <= 65536 && - isPowerOf2_32(RVVVectorBitsMin))) && - "V extension requires vector length to be in the range of 128 to " - "65536 and a power of 2!"); + // FIXME: Change to >= 32 when VLEN = 32 is supported + assert( + (RVVVectorBitsMin == 0 || + (RVVVectorBitsMin >= 64 && RVVVectorBitsMin <= 65536 && + isPowerOf2_32(RVVVectorBitsMin))) && + "V or Zve* extension requires vector length to be in the range of 64 to " + "65536 and a power of 2!"); assert((RVVVectorBitsMax >= RVVVectorBitsMin || RVVVectorBitsMax == 0) && "Minimum V extension vector length should not be larger than its " "maximum!"); unsigned Min = RVVVectorBitsMin; if (RVVVectorBitsMax != 0) Min = std::min(RVVVectorBitsMin, RVVVectorBitsMax); - return PowerOf2Floor((Min < 128 || Min > 65536) ? 0 : Min); + return PowerOf2Floor((Min < 64 || Min > 65536) ? 0 : Min); } unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const { @@ -158,8 +201,9 @@ unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const { assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 && isPowerOf2_32(RVVVectorELENMax) && "V extension requires a ELEN to be a power of 2 between 8 and 64!"); + unsigned ELEN = hasVInstructionsI64() ? 
64 : 32; return PowerOf2Floor( - std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8)); + std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, ELEN), 8)); } bool RISCVSubtarget::useRVVForFixedLengthVectors() const { diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index d0330e6984a5..044dda0a1ccc 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -33,7 +33,33 @@ namespace llvm { class StringRef; class RISCVSubtarget : public RISCVGenSubtargetInfo { +public: + enum ExtZvl : unsigned { + NotSet = 0, + Zvl32b = 32, + Zvl64b = 64, + Zvl128b = 128, + Zvl256b = 256, + Zvl512b = 512, + Zvl1024b = 1024, + Zvl2048b = 2048, + Zvl4096b = 4096, + Zvl8192b = 8192, + Zvl16384b = 16384, + Zvl32768b = 32768, + Zvl65536b = 65536 + }; + + enum RISCVProcFamilyEnum : uint8_t { + Others, + SiFive7, + }; + +private: virtual void anchor(); + + RISCVProcFamilyEnum RISCVProcFamily = Others; + bool HasStdExtM = false; bool HasStdExtA = false; bool HasStdExtF = false; @@ -50,15 +76,33 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasStdExtZbs = false; bool HasStdExtZbt = false; bool HasStdExtV = false; - bool HasStdExtZvlsseg = false; + bool HasStdExtZve32x = false; + bool HasStdExtZve32f = false; + bool HasStdExtZve64x = false; + bool HasStdExtZve64f = false; + bool HasStdExtZve64d = false; bool HasStdExtZfhmin = false; bool HasStdExtZfh = false; + bool HasStdExtZbkb = false; + bool HasStdExtZbkc = false; + bool HasStdExtZbkx = false; + bool HasStdExtZknd = false; + bool HasStdExtZkne = false; + bool HasStdExtZknh = false; + bool HasStdExtZksed = false; + bool HasStdExtZksh = false; + bool HasStdExtZkr = false; + bool HasStdExtZkn = false; + bool HasStdExtZks = false; + bool HasStdExtZkt = false; + bool HasStdExtZk = false; bool HasRV64 = false; bool IsRV32E = false; bool EnableLinkerRelax = false; bool EnableRVCHintInstrs = true; bool EnableSaveRestore = false; unsigned XLen = 32; + ExtZvl ZvlLen = ExtZvl::NotSet; MVT XLenVT = MVT::i32; uint8_t MaxInterleaveFactor = 2; RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown; @@ -100,11 +144,19 @@ public: return &TSInfo; } bool enableMachineScheduler() const override { return true; } + + /// Returns RISCV processor family. + /// Avoid this function! CPU specifics should be kept local to this class + /// and preferably modeled with SubtargetFeatures or properties in + /// initializeProperties(). 
+ RISCVProcFamilyEnum getProcFamily() const { return RISCVProcFamily; } + bool hasStdExtM() const { return HasStdExtM; } bool hasStdExtA() const { return HasStdExtA; } bool hasStdExtF() const { return HasStdExtF; } bool hasStdExtD() const { return HasStdExtD; } bool hasStdExtC() const { return HasStdExtC; } + bool hasStdExtV() const { return HasStdExtV; } bool hasStdExtZba() const { return HasStdExtZba; } bool hasStdExtZbb() const { return HasStdExtZbb; } bool hasStdExtZbc() const { return HasStdExtZbc; } @@ -115,10 +167,18 @@ public: bool hasStdExtZbr() const { return HasStdExtZbr; } bool hasStdExtZbs() const { return HasStdExtZbs; } bool hasStdExtZbt() const { return HasStdExtZbt; } - bool hasStdExtV() const { return HasStdExtV; } - bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; } + bool hasStdExtZvl() const { return ZvlLen != ExtZvl::NotSet; } bool hasStdExtZfhmin() const { return HasStdExtZfhmin; } bool hasStdExtZfh() const { return HasStdExtZfh; } + bool hasStdExtZbkb() const { return HasStdExtZbkb; } + bool hasStdExtZbkc() const { return HasStdExtZbkc; } + bool hasStdExtZbkx() const { return HasStdExtZbkx; } + bool hasStdExtZknd() const { return HasStdExtZknd; } + bool hasStdExtZkne() const { return HasStdExtZkne; } + bool hasStdExtZknh() const { return HasStdExtZknh; } + bool hasStdExtZksed() const { return HasStdExtZksed; } + bool hasStdExtZksh() const { return HasStdExtZksh; } + bool hasStdExtZkr() const { return HasStdExtZkr; } bool is64Bit() const { return HasRV64; } bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } @@ -126,6 +186,15 @@ public: bool enableSaveRestore() const { return EnableSaveRestore; } MVT getXLenVT() const { return XLenVT; } unsigned getXLen() const { return XLen; } + unsigned getFLen() const { + if (HasStdExtD) + return 64; + + if (HasStdExtF) + return 32; + + return 0; + } RISCVABI::ABI getTargetABI() const { return TargetABI; } bool isRegisterReservedByUser(Register i) const { assert(i < RISCV::NUM_TARGET_REGS && "Register out of range"); @@ -133,11 +202,19 @@ public: } // Vector codegen related methods. - bool hasVInstructions() const { return HasStdExtV; } - bool hasVInstructionsI64() const { return HasStdExtV; } - bool hasVInstructionsF16() const { return HasStdExtV && hasStdExtZfh(); } - bool hasVInstructionsF32() const { return HasStdExtV && hasStdExtF(); } - bool hasVInstructionsF64() const { return HasStdExtV && hasStdExtD(); } + bool hasVInstructions() const { return HasStdExtV || HasStdExtZve32x; } + bool hasVInstructionsI64() const { return HasStdExtV || HasStdExtZve64x; } + bool hasVInstructionsF16() const { + return (HasStdExtV || HasStdExtZve32f) && HasStdExtZfh; + } + // FIXME: Consider Zfinx in the future + bool hasVInstructionsF32() const { + return HasStdExtV || (HasStdExtZve32f && HasStdExtF); + } + // FIXME: Consider Zdinx in the future + bool hasVInstructionsF64() const { + return HasStdExtV || (HasStdExtZve64d && HasStdExtD); + } // F16 and F64 both require F32. bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); } unsigned getMaxInterleaveFactor() const { @@ -157,6 +234,12 @@ public: const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; + bool useConstantPoolForLargeInts() const; + + // Maximum cost used for building integers; integers will be put into the + // constant pool if this cost is exceeded. + unsigned getMaxBuildIntsCost() const; + + // Return the known range for the bit length of RVV data registers.
A value // of 0 means nothing is known about that particular limit beyond what's // implied by the architecture. diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td index 5a4c579dd708..b9aa25b321b0 100644 --- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -1,4 +1,4 @@ -//===- RISCVSystemOperands.td ----------------------------*- tablegen -*-===// +//===- RISCVSystemOperands.td ------------------------------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -70,16 +70,16 @@ def lookupSysRegByDeprecatedName : SearchIndex { // 2.3, 2.4 and 2.5 in the RISC-V Instruction Set Manual // Volume II: Privileged Architecture. -//===-------------------------- +//===----------------------------------------------------------------------===// // User Trap Setup -//===-------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"ustatus", 0x000>; def : SysReg<"uie", 0x004>; def : SysReg<"utvec", 0x005>; -//===-------------------------- +//===----------------------------------------------------------------------===// // User Trap Handling -//===-------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"uscratch", 0x040>; def : SysReg<"uepc", 0x041>; def : SysReg<"ucause", 0x042>; @@ -87,100 +87,57 @@ let DeprecatedName = "ubadaddr" in def : SysReg<"utval", 0x043>; def : SysReg<"uip", 0x044>; -//===-------------------------- +//===----------------------------------------------------------------------===// // User Floating-Point CSRs -//===-------------------------- +//===----------------------------------------------------------------------===// def SysRegFFLAGS : SysReg<"fflags", 0x001>; def SysRegFRM : SysReg<"frm", 0x002>; def SysRegFCSR : SysReg<"fcsr", 0x003>; -//===-------------------------- +//===----------------------------------------------------------------------===// // User Counter/Timers -//===-------------------------- +//===----------------------------------------------------------------------===// def CYCLE : SysReg<"cycle", 0xC00>; def TIME : SysReg<"time", 0xC01>; def INSTRET : SysReg<"instret", 0xC02>; -def : SysReg<"hpmcounter3", 0xC03>; -def : SysReg<"hpmcounter4", 0xC04>; -def : SysReg<"hpmcounter5", 0xC05>; -def : SysReg<"hpmcounter6", 0xC06>; -def : SysReg<"hpmcounter7", 0xC07>; -def : SysReg<"hpmcounter8", 0xC08>; -def : SysReg<"hpmcounter9", 0xC09>; -def : SysReg<"hpmcounter10", 0xC0A>; -def : SysReg<"hpmcounter11", 0xC0B>; -def : SysReg<"hpmcounter12", 0xC0C>; -def : SysReg<"hpmcounter13", 0xC0D>; -def : SysReg<"hpmcounter14", 0xC0E>; -def : SysReg<"hpmcounter15", 0xC0F>; -def : SysReg<"hpmcounter16", 0xC10>; -def : SysReg<"hpmcounter17", 0xC11>; -def : SysReg<"hpmcounter18", 0xC12>; -def : SysReg<"hpmcounter19", 0xC13>; -def : SysReg<"hpmcounter20", 0xC14>; -def : SysReg<"hpmcounter21", 0xC15>; -def : SysReg<"hpmcounter22", 0xC16>; -def : SysReg<"hpmcounter23", 0xC17>; -def : SysReg<"hpmcounter24", 0xC18>; -def : SysReg<"hpmcounter25", 0xC19>; -def : SysReg<"hpmcounter26", 0xC1A>; -def : SysReg<"hpmcounter27", 0xC1B>; -def : SysReg<"hpmcounter28", 0xC1C>; -def : SysReg<"hpmcounter29", 0xC1D>; -def : SysReg<"hpmcounter30", 0xC1E>; -def : SysReg<"hpmcounter31", 0xC1F>; +// hpmcounter3-hpmcounter31 at 0xC03-0xC1F. 
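+// For example, i=3 yields !add(0xC03, 0) = 0xC03 and i=31 yields +// !add(0xC03, 28) = 0xC1F.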
+foreach i = 3...31 in + def : SysReg<"hpmcounter"#i, !add(0xC03, !sub(i, 3))>; let isRV32Only = 1 in { def CYCLEH : SysReg<"cycleh", 0xC80>; def TIMEH : SysReg<"timeh", 0xC81>; def INSTRETH : SysReg<"instreth", 0xC82>; -def: SysReg<"hpmcounter3h", 0xC83>; -def: SysReg<"hpmcounter4h", 0xC84>; -def: SysReg<"hpmcounter5h", 0xC85>; -def: SysReg<"hpmcounter6h", 0xC86>; -def: SysReg<"hpmcounter7h", 0xC87>; -def: SysReg<"hpmcounter8h", 0xC88>; -def: SysReg<"hpmcounter9h", 0xC89>; -def: SysReg<"hpmcounter10h", 0xC8A>; -def: SysReg<"hpmcounter11h", 0xC8B>; -def: SysReg<"hpmcounter12h", 0xC8C>; -def: SysReg<"hpmcounter13h", 0xC8D>; -def: SysReg<"hpmcounter14h", 0xC8E>; -def: SysReg<"hpmcounter15h", 0xC8F>; -def: SysReg<"hpmcounter16h", 0xC90>; -def: SysReg<"hpmcounter17h", 0xC91>; -def: SysReg<"hpmcounter18h", 0xC92>; -def: SysReg<"hpmcounter19h", 0xC93>; -def: SysReg<"hpmcounter20h", 0xC94>; -def: SysReg<"hpmcounter21h", 0xC95>; -def: SysReg<"hpmcounter22h", 0xC96>; -def: SysReg<"hpmcounter23h", 0xC97>; -def: SysReg<"hpmcounter24h", 0xC98>; -def: SysReg<"hpmcounter25h", 0xC99>; -def: SysReg<"hpmcounter26h", 0xC9A>; -def: SysReg<"hpmcounter27h", 0xC9B>; -def: SysReg<"hpmcounter28h", 0xC9C>; -def: SysReg<"hpmcounter29h", 0xC9D>; -def: SysReg<"hpmcounter30h", 0xC9E>; -def: SysReg<"hpmcounter31h", 0xC9F>; +// hpmcounter3h-hpmcounter31h at 0xC83-0xC9F. +foreach i = 3...31 in + def : SysReg<"hpmcounter"#i#"h", !add(0xC83, !sub(i, 3))>; } -//===-------------------------- +//===----------------------------------------------------------------------===// // Supervisor Trap Setup -//===-------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"sstatus", 0x100>; def : SysReg<"sedeleg", 0x102>; def : SysReg<"sideleg", 0x103>; def : SysReg<"sie", 0x104>; def : SysReg<"stvec", 0x105>; def : SysReg<"scounteren", 0x106>; +def : SysReg<"stimecmp", 0x14D>; +let isRV32Only = 1 in +def : SysReg<"stimecmph", 0x15D>; + +//===----------------------------------------------------------------------===// +// Supervisor Configuration +//===----------------------------------------------------------------------===// + +def : SysReg<"senvcfg", 0x10A>; -//===-------------------------- +//===----------------------------------------------------------------------===// // Supervisor Trap Handling -//===-------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"sscratch", 0x140>; def : SysReg<"sepc", 0x141>; def : SysReg<"scause", 0x142>; @@ -188,24 +145,103 @@ let DeprecatedName = "sbadaddr" in def : SysReg<"stval", 0x143>; def : SysReg<"sip", 0x144>; -//===------------------------------------- +//===----------------------------------------------------------------------===// // Supervisor Protection and Translation -//===------------------------------------- +//===----------------------------------------------------------------------===// let DeprecatedName = "sptbr" in def : SysReg<"satp", 0x180>; -//===----------------------------- +//===----------------------------------------------------------------------===// +// Debug/Trace Registers +//===----------------------------------------------------------------------===// + +def : SysReg<"scontext", 0x5A8>; + +//===----------------------------------------------------------------------===// +// Supervisor Count Overflow (defined in Sscofpmf) +//===----------------------------------------------------------------------===// + +def : SysReg<"scountovf", 0xDA0>; + 
+//===----------------------------------------------------------------------===// +// Hypervisor Trap Setup +//===----------------------------------------------------------------------===// + +def : SysReg<"hstatus", 0x600>; +def : SysReg<"hedeleg", 0x602>; +def : SysReg<"hideleg", 0x603>; +def : SysReg<"hie", 0x604>; +def : SysReg<"hcounteren", 0x606>; +def : SysReg<"hgeie", 0x607>; + +//===----------------------------------------------------------------------===// +// Hypervisor Trap Handling +//===----------------------------------------------------------------------===// + +def : SysReg<"htval", 0x643>; +def : SysReg<"hip", 0x644>; +def : SysReg<"hvip", 0x645>; +def : SysReg<"htinst", 0x64A>; +def : SysReg<"hgeip", 0xE12>; + +//===----------------------------------------------------------------------===// +// Hypervisor Configuration +//===----------------------------------------------------------------------===// + +def : SysReg<"henvcfg", 0x60A>; +let isRV32Only = 1 in +def : SysReg<"henvcfgh", 0x61A>; + +//===----------------------------------------------------------------------===// +// Hypervisor Protection and Translation +//===----------------------------------------------------------------------===// + +def : SysReg<"hgatp", 0x680>; + +//===----------------------------------------------------------------------===// +// Debug/Trace Registers +//===----------------------------------------------------------------------===// + +def : SysReg<"hcontext", 0x6A8>; + +//===----------------------------------------------------------------------===// +// Hypervisor Counter/Timer Virtualization Registers +//===----------------------------------------------------------------------===// + +def : SysReg<"htimedelta", 0x605>; +let isRV32Only = 1 in +def : SysReg<"htimedeltah", 0x615>; + +//===----------------------------------------------------------------------===// +// Virtual Supervisor Registers +//===----------------------------------------------------------------------===// + +def : SysReg<"vsstatus", 0x200>; +def : SysReg<"vsie", 0x204>; +def : SysReg<"vstvec", 0x205>; +def : SysReg<"vsscratch", 0x240>; +def : SysReg<"vsepc", 0x241>; +def : SysReg<"vscause", 0x242>; +def : SysReg<"vstval", 0x243>; +def : SysReg<"vsip", 0x244>; +def : SysReg<"vstimecmp", 0x24D>; +let isRV32Only = 1 in +def : SysReg<"vstimecmph", 0x25D>; +def : SysReg<"vsatp", 0x280>; + +//===----------------------------------------------------------------------===// // Machine Information Registers -//===----------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"mvendorid", 0xF11>; def : SysReg<"marchid", 0xF12>; def : SysReg<"mimpid", 0xF13>; def : SysReg<"mhartid", 0xF14>; +def : SysReg<"mconfigptr", 0xF15>; -//===----------------------------- +//===----------------------------------------------------------------------===// // Machine Trap Setup -//===----------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"mstatus", 0x300>; def : SysReg<"misa", 0x301>; def : SysReg<"medeleg", 0x302>; @@ -213,163 +249,93 @@ def : SysReg<"mideleg", 0x303>; def : SysReg<"mie", 0x304>; def : SysReg<"mtvec", 0x305>; def : SysReg<"mcounteren", 0x306>; +let isRV32Only = 1 in +def : SysReg<"mstatush", 0x310>; -//===----------------------------- +//===----------------------------------------------------------------------===// // Machine Trap Handling -//===----------------------------- 
+//===----------------------------------------------------------------------===// def : SysReg<"mscratch", 0x340>; def : SysReg<"mepc", 0x341>; def : SysReg<"mcause", 0x342>; let DeprecatedName = "mbadaddr" in def : SysReg<"mtval", 0x343>; def : SysReg<"mip", 0x344>; +def : SysReg<"mtinst", 0x34A>; +def : SysReg<"mtval2", 0x34B>; -//===---------------------------------- +//===----------------------------------------------------------------------===// +// Machine Configuration +//===----------------------------------------------------------------------===// + +def : SysReg<"menvcfg", 0x30A>; +let isRV32Only = 1 in +def : SysReg<"menvcfgh", 0x31A>; +def : SysReg<"mseccfg", 0x747>; +let isRV32Only = 1 in +def : SysReg<"mseccfgh", 0x757>; + +//===----------------------------------------------------------------------===// // Machine Protection and Translation -//===---------------------------------- -def : SysReg<"pmpcfg0", 0x3A0>; -def : SysReg<"pmpcfg2", 0x3A2>; -let isRV32Only = 1 in { -def : SysReg<"pmpcfg1", 0x3A1>; -def : SysReg<"pmpcfg3", 0x3A3>; +//===----------------------------------------------------------------------===// + +// pmpcfg0-pmpcfg15 at 0x3A0-0x3AF. Odd-numbered registers are RV32-only. +foreach i = 0...15 in { + let isRV32Only = !and(i, 1) in + def : SysReg<"pmpcfg"#i, !add(0x3A0, i)>; } -def : SysReg<"pmpaddr0", 0x3B0>; -def : SysReg<"pmpaddr1", 0x3B1>; -def : SysReg<"pmpaddr2", 0x3B2>; -def : SysReg<"pmpaddr3", 0x3B3>; -def : SysReg<"pmpaddr4", 0x3B4>; -def : SysReg<"pmpaddr5", 0x3B5>; -def : SysReg<"pmpaddr6", 0x3B6>; -def : SysReg<"pmpaddr7", 0x3B7>; -def : SysReg<"pmpaddr8", 0x3B8>; -def : SysReg<"pmpaddr9", 0x3B9>; -def : SysReg<"pmpaddr10", 0x3BA>; -def : SysReg<"pmpaddr11", 0x3BB>; -def : SysReg<"pmpaddr12", 0x3BC>; -def : SysReg<"pmpaddr13", 0x3BD>; -def : SysReg<"pmpaddr14", 0x3BE>; -def : SysReg<"pmpaddr15", 0x3BF>; - - -//===-------------------------- +// pmpaddr0-pmpaddr63 at 0x3B0-0x3EF. +foreach i = 0...63 in + def : SysReg<"pmpaddr"#i, !add(0x3B0, i)>; + +//===----------------------------------------------------------------------===// // Machine Counter and Timers -//===-------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"mcycle", 0xB00>; def : SysReg<"minstret", 0xB02>; -def : SysReg<"mhpmcounter3", 0xB03>; -def : SysReg<"mhpmcounter4", 0xB04>; -def : SysReg<"mhpmcounter5", 0xB05>; -def : SysReg<"mhpmcounter6", 0xB06>; -def : SysReg<"mhpmcounter7", 0xB07>; -def : SysReg<"mhpmcounter8", 0xB08>; -def : SysReg<"mhpmcounter9", 0xB09>; -def : SysReg<"mhpmcounter10", 0xB0A>; -def : SysReg<"mhpmcounter11", 0xB0B>; -def : SysReg<"mhpmcounter12", 0xB0C>; -def : SysReg<"mhpmcounter13", 0xB0D>; -def : SysReg<"mhpmcounter14", 0xB0E>; -def : SysReg<"mhpmcounter15", 0xB0F>; -def : SysReg<"mhpmcounter16", 0xB10>; -def : SysReg<"mhpmcounter17", 0xB11>; -def : SysReg<"mhpmcounter18", 0xB12>; -def : SysReg<"mhpmcounter19", 0xB13>; -def : SysReg<"mhpmcounter20", 0xB14>; -def : SysReg<"mhpmcounter21", 0xB15>; -def : SysReg<"mhpmcounter22", 0xB16>; -def : SysReg<"mhpmcounter23", 0xB17>; -def : SysReg<"mhpmcounter24", 0xB18>; -def : SysReg<"mhpmcounter25", 0xB19>; -def : SysReg<"mhpmcounter26", 0xB1A>; -def : SysReg<"mhpmcounter27", 0xB1B>; -def : SysReg<"mhpmcounter28", 0xB1C>; -def : SysReg<"mhpmcounter29", 0xB1D>; -def : SysReg<"mhpmcounter30", 0xB1E>; -def : SysReg<"mhpmcounter31", 0xB1F>; +// mhpmcounter3-mhpmcounter31 at 0xB03-0xB1F. 
+foreach i = 3...31 in + def : SysReg<"mhpmcounter"#i, !add(0xB03, !sub(i, 3))>; let isRV32Only = 1 in { def: SysReg<"mcycleh", 0xB80>; def: SysReg<"minstreth", 0xB82>; -def: SysReg<"mhpmcounter3h", 0xB83>; -def: SysReg<"mhpmcounter4h", 0xB84>; -def: SysReg<"mhpmcounter5h", 0xB85>; -def: SysReg<"mhpmcounter6h", 0xB86>; -def: SysReg<"mhpmcounter7h", 0xB87>; -def: SysReg<"mhpmcounter8h", 0xB88>; -def: SysReg<"mhpmcounter9h", 0xB89>; -def: SysReg<"mhpmcounter10h", 0xB8A>; -def: SysReg<"mhpmcounter11h", 0xB8B>; -def: SysReg<"mhpmcounter12h", 0xB8C>; -def: SysReg<"mhpmcounter13h", 0xB8D>; -def: SysReg<"mhpmcounter14h", 0xB8E>; -def: SysReg<"mhpmcounter15h", 0xB8F>; -def: SysReg<"mhpmcounter16h", 0xB90>; -def: SysReg<"mhpmcounter17h", 0xB91>; -def: SysReg<"mhpmcounter18h", 0xB92>; -def: SysReg<"mhpmcounter19h", 0xB93>; -def: SysReg<"mhpmcounter20h", 0xB94>; -def: SysReg<"mhpmcounter21h", 0xB95>; -def: SysReg<"mhpmcounter22h", 0xB96>; -def: SysReg<"mhpmcounter23h", 0xB97>; -def: SysReg<"mhpmcounter24h", 0xB98>; -def: SysReg<"mhpmcounter25h", 0xB99>; -def: SysReg<"mhpmcounter26h", 0xB9A>; -def: SysReg<"mhpmcounter27h", 0xB9B>; -def: SysReg<"mhpmcounter28h", 0xB9C>; -def: SysReg<"mhpmcounter29h", 0xB9D>; -def: SysReg<"mhpmcounter30h", 0xB9E>; -def: SysReg<"mhpmcounter31h", 0xB9F>; +// mhpmcounter3h-mhpmcounter31h at 0xB83-0xB9F. +foreach i = 3...31 in + def : SysReg<"mhpmcounter"#i#"h", !add(0xB83, !sub(i, 3))>; } -//===-------------------------- +//===----------------------------------------------------------------------===// // Machine Counter Setup -//===-------------------------- +//===----------------------------------------------------------------------===// let AltName = "mucounteren" in // Privileged spec v1.9.1 Name def : SysReg<"mcountinhibit", 0x320>; -def : SysReg<"mhpmevent3", 0x323>; -def : SysReg<"mhpmevent4", 0x324>; -def : SysReg<"mhpmevent5", 0x325>; -def : SysReg<"mhpmevent6", 0x326>; -def : SysReg<"mhpmevent7", 0x327>; -def : SysReg<"mhpmevent8", 0x328>; -def : SysReg<"mhpmevent9", 0x329>; -def : SysReg<"mhpmevent10", 0x32A>; -def : SysReg<"mhpmevent11", 0x32B>; -def : SysReg<"mhpmevent12", 0x32C>; -def : SysReg<"mhpmevent13", 0x32D>; -def : SysReg<"mhpmevent14", 0x32E>; -def : SysReg<"mhpmevent15", 0x32F>; -def : SysReg<"mhpmevent16", 0x330>; -def : SysReg<"mhpmevent17", 0x331>; -def : SysReg<"mhpmevent18", 0x332>; -def : SysReg<"mhpmevent19", 0x333>; -def : SysReg<"mhpmevent20", 0x334>; -def : SysReg<"mhpmevent21", 0x335>; -def : SysReg<"mhpmevent22", 0x336>; -def : SysReg<"mhpmevent23", 0x337>; -def : SysReg<"mhpmevent24", 0x338>; -def : SysReg<"mhpmevent25", 0x339>; -def : SysReg<"mhpmevent26", 0x33A>; -def : SysReg<"mhpmevent27", 0x33B>; -def : SysReg<"mhpmevent28", 0x33C>; -def : SysReg<"mhpmevent29", 0x33D>; -def : SysReg<"mhpmevent30", 0x33E>; -def : SysReg<"mhpmevent31", 0x33F>; +// mhpmevent3-mhpmevent31 at 0x323-0x33F. 
+foreach i = 3...31 in + def : SysReg<"mhpmevent"#i, !add(0x323, !sub(i, 3))>; -//===----------------------------------------------- +// mhpmevent3h-mhpmevent31h at 0x723-0x73F +foreach i = 3...31 in { + let isRV32Only = 1 in + def : SysReg<"mhpmevent"#i#"h", !add(0x723, !sub(i, 3))>; +} + +//===----------------------------------------------------------------------===// // Debug/ Trace Registers (shared with Debug Mode) -//===----------------------------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"tselect", 0x7A0>; def : SysReg<"tdata1", 0x7A1>; def : SysReg<"tdata2", 0x7A2>; def : SysReg<"tdata3", 0x7A3>; +def : SysReg<"mcontext", 0x7A8>; -//===----------------------------------------------- +//===----------------------------------------------------------------------===// // Debug Mode Registers -//===----------------------------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"dcsr", 0x7B0>; def : SysReg<"dpc", 0x7B1>; @@ -379,9 +345,9 @@ let AltName = "dscratch" in def : SysReg<"dscratch0", 0x7B2>; def : SysReg<"dscratch1", 0x7B3>; -//===----------------------------------------------- +//===----------------------------------------------------------------------===// // User Vector CSRs -//===----------------------------------------------- +//===----------------------------------------------------------------------===// def : SysReg<"vstart", 0x008>; def : SysReg<"vxsat", 0x009>; def : SysReg<"vxrm", 0x00A>; @@ -389,3 +355,26 @@ def : SysReg<"vcsr", 0x00F>; def : SysReg<"vl", 0xC20>; def : SysReg<"vtype", 0xC21>; def SysRegVLENB: SysReg<"vlenb", 0xC22>; + +//===----------------------------------------------------------------------===// +// State Enable Extension (Smstateen) +//===----------------------------------------------------------------------===// + +// sstateen0-sstateen3 at 0x10C-0x10F, mstateen0-mstateen3 at 0x30C-0x30F, +// mstateen0h-mstateen3h at 0x31C-0x31F, hstateen0-hstateen3 at 0x60C-0x60F, +// and hstateen0h-hstateen3h at 0x61C-0x61F. 
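+// Each RV32-only "h" register sits 0x10 above its full-width counterpart.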
+foreach i = 0...3 in { + def : SysReg<"sstateen"#i, !add(0x10C, i)>; + def : SysReg<"mstateen"#i, !add(0x30C, i)>; + let isRV32Only = 1 in + def : SysReg<"mstateen"#i#"h", !add(0x31C, i)>; + def : SysReg<"hstateen"#i, !add(0x60C, i)>; + let isRV32Only = 1 in + def : SysReg<"hstateen"#i#"h", !add(0x61C, i)>; +} + +//===----------------------------------------------------------------------===// +// Entropy Source CSR +//===----------------------------------------------------------------------===// + +def SEED : SysReg<"seed", 0x015>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index b421eba8d442..db5e2f1eeb6f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -39,6 +39,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeGlobalISel(*PR); initializeRISCVGatherScatterLoweringPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); + initializeRISCVSExtWRemovalPass(*PR); initializeRISCVExpandPseudoPass(*PR); initializeRISCVInsertVSETVLIPass(*PR); } @@ -140,6 +141,7 @@ public: void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; + void addMachineSSAOptimization() override; void addPreRegAlloc() override; }; } // namespace @@ -194,6 +196,13 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createRISCVExpandAtomicPseudoPass()); } +void RISCVPassConfig::addMachineSSAOptimization() { + TargetPassConfig::addMachineSSAOptimization(); + + if (TM->getTargetTriple().getArch() == Triple::riscv64) + addPass(createRISCVSExtWRemovalPass()); +} + void RISCVPassConfig::addPreRegAlloc() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createRISCVMergeBaseOffsetOptPass()); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index c435430a1288..99e6774a02e4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -15,6 +15,13 @@ using namespace llvm; #define DEBUG_TYPE "riscvtti" +static cl::opt<unsigned> RVVRegisterWidthLMUL( + "riscv-v-register-bit-width-lmul", + cl::desc( + "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " + "by autovectorized code. Fractional LMULs are not supported."), + cl::init(1), cl::Hidden); + InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy() && @@ -137,6 +144,24 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const { return BaseT::getMaxVScale(); } +TypeSize +RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { + unsigned LMUL = PowerOf2Floor( + std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1)); + switch (K) { + case TargetTransformInfo::RGK_Scalar: + return TypeSize::getFixed(ST->getXLen()); + case TargetTransformInfo::RGK_FixedWidthVector: + return TypeSize::getFixed( + ST->hasVInstructions() ? LMUL * ST->getMinRVVVectorSizeInBits() : 0); + case TargetTransformInfo::RGK_ScalableVector: + return TypeSize::getScalable( + ST->hasVInstructions() ?
LMUL * RISCV::RVVBitsPerBlock : 0); + } + + llvm_unreachable("Unsupported register kind"); +} + InstructionCost RISCVTTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { @@ -172,10 +197,7 @@ void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Support explicit targets enabled for SiFive with the unrolling preferences // below bool UseDefaultPreferences = true; - if (ST->getTuneCPU().contains("sifive-e76") || - ST->getTuneCPU().contains("sifive-s76") || - ST->getTuneCPU().contains("sifive-u74") || - ST->getTuneCPU().contains("sifive-7")) + if (ST->getProcFamily() == RISCVSubtarget::SiFive7) UseDefaultPreferences = false; if (UseDefaultPreferences) @@ -253,3 +275,16 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) { BaseT::getPeelingPreferences(L, SE, PP); } + +InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) { + TypeSize Size = Ty->getPrimitiveSizeInBits(); + if (Ty->isVectorTy()) { + if (Size.isScalable() && ST->hasVInstructions()) + return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock); + + if (ST->useRVVForFixedLengthVectors()) + return divideCeil(Size, ST->getMinRVVVectorSizeInBits()); + } + + return BaseT::getRegUsageForType(Ty); +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 7353496f4684..e79c4f75712b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -58,20 +58,9 @@ public: bool supportsScalableVectors() const { return ST->hasVInstructions(); } Optional<unsigned> getMaxVScale() const; - TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { - switch (K) { - case TargetTransformInfo::RGK_Scalar: - return TypeSize::getFixed(ST->getXLen()); - case TargetTransformInfo::RGK_FixedWidthVector: - return TypeSize::getFixed( - ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0); - case TargetTransformInfo::RGK_ScalableVector: - return TypeSize::getScalable( - ST->hasVInstructions() ? RISCV::RVVBitsPerBlock : 0); - } + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; - llvm_unreachable("Unsupported register kind"); - } + InstructionCost getRegUsageForType(Type *Ty); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, @@ -81,7 +70,7 @@ public: TTI::PeelingPreferences &PP); unsigned getMinVectorRegisterBitWidth() const { - return ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0; + return ST->useRVVForFixedLengthVectors() ? 16 : 0; } InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, @@ -189,6 +178,20 @@ public: // Let regular unroll to unroll the loop. return VF == 1 ? 1 : ST->getMaxInterleaveFactor(); } + + // TODO: We should define RISC-V's own register classes. + // e.g. register class for FPR. + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); + if (Vector) { + if (ST->hasVInstructions()) + return 32; + return 0; + } + // 31 = 32 GPR - x0 (zero register) + // FIXME: Should we exclude fixed registers like SP, TP or GP? 
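+ // (Excluding x2/sp, x3/gp and x4/tp as well would leave 28.)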
+ return 31; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 48e6903bd1b1..af3304f0907d 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -257,7 +257,7 @@ private: }; public: - SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + SparcOperand(KindTy K) : Kind(K) {} bool isToken() const override { return Kind == k_Token; } bool isReg() const override { return Kind == k_Register; } diff --git a/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp index afb69899e724..c5d0f1de7dfd 100644 --- a/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -66,7 +66,7 @@ private: } // end anonymous namespace SDNode* SparcDAGToDAGISel::getGlobalBaseReg() { - unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF); + Register GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF); return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(CurDAG->getDataLayout())) .getNode(); @@ -168,8 +168,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ // placement. SDLoc dl(N); - SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) - : SDValue(nullptr,0); + SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps - 1) : SDValue(); SmallVector<bool, 8> OpChanged; // Glue node will be appended late. @@ -221,8 +220,8 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){ assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); SDValue V0 = N->getOperand(i+1); SDValue V1 = N->getOperand(i+2); - unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); - unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + Register Reg0 = cast<RegisterSDNode>(V0)->getReg(); + Register Reg1 = cast<RegisterSDNode>(V1)->getReg(); SDValue PairedReg; MachineRegisterInfo &MRI = MF->getRegInfo(); diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index ed1faf6b1fe8..6d6879bc94b3 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -826,7 +826,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, // sret only allowed on first argument assert(Outs[realArgIdx].OrigArgIndex == 0); PointerType *Ty = cast<PointerType>(CLI.getArgs()[0].Ty); - Type *ElementTy = Ty->getElementType(); + Type *ElementTy = Ty->getPointerElementType(); SRetArgSize = DAG.getDataLayout().getTypeAllocSize(ElementTy); continue; } @@ -2684,7 +2684,7 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG, SDValue RetAddr; if (depth == 0) { auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); - unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT)); + Register RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT)); RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT); return RetAddr; } @@ -3245,7 +3245,7 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { - SDValue Result(nullptr, 0); + SDValue Result; // Only support length 1 constraints for now. 
if (Constraint.length() > 1) diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/llvm/lib/Target/Sparc/SparcTargetObjectFile.h index 9bbe602b32b3..f30ddc7b4955 100644 --- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.h +++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.h @@ -18,9 +18,7 @@ class TargetMachine; class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF { public: - SparcELFTargetObjectFile() : - TargetLoweringObjectFileELF() - {} + SparcELFTargetObjectFile() {} void Initialize(MCContext &Ctx, const TargetMachine &TM) override; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 899fec6c3328..e76fa03af3bf 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -23,11 +23,7 @@ class MCObjectTargetWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; -class StringRef; class Target; -class Triple; -class raw_pwrite_stream; -class raw_ostream; namespace SystemZMC { // How many bytes are in the ABI-defined, caller-allocated part of diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h index bedbd061ea5c..5be19f0e3b46 100644 --- a/llvm/lib/Target/SystemZ/SystemZ.h +++ b/llvm/lib/Target/SystemZ/SystemZ.h @@ -20,6 +20,7 @@ namespace llvm { class SystemZTargetMachine; class FunctionPass; +class PassRegistry; namespace SystemZ { // Condition-code mask values. @@ -196,6 +197,15 @@ FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM); FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM); FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM); FunctionPass *createSystemZTDCPass(); + +void initializeSystemZElimComparePass(PassRegistry &); +void initializeSystemZShortenInstPass(PassRegistry &); +void initializeSystemZLongBranchPass(PassRegistry &); +void initializeSystemZLDCleanupPass(PassRegistry &); +void initializeSystemZCopyPhysRegsPass(PassRegistry &); +void initializeSystemZPostRewritePass(PassRegistry &); +void initializeSystemZTDCPassPass(PassRegistry &); + } // end namespace llvm #endif diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index defab665f924..e01adcce04ab 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -786,6 +786,50 @@ void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) { emitStackMaps(SM); } +void SystemZAsmPrinter::emitFunctionEntryLabel() { + const SystemZSubtarget &Subtarget = + static_cast<const SystemZSubtarget &>(MF->getSubtarget()); + + if (Subtarget.getTargetTriple().isOSzOS()) { + MCContext &OutContext = OutStreamer->getContext(); + MCSymbol *EPMarkerSym = OutContext.createTempSymbol("CM_", true); + + // EntryPoint Marker + const MachineFrameInfo &MFFrame = MF->getFrameInfo(); + bool IsUsingAlloca = MFFrame.hasVarSizedObjects(); + + // Set Flags + uint8_t Flags = 0; + if (IsUsingAlloca) + Flags |= 0x04; + + uint32_t DSASize = MFFrame.getStackSize(); + + // Combine into top 27 bits of DSASize and bottom 5 bits of Flags. + uint32_t DSAAndFlags = DSASize & 0xFFFFFFE0; // (x/32) << 5 + DSAAndFlags |= Flags; + + // Emit entry point marker section. 
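+ // The marker is 12 bytes in total: a 7-byte eyecatcher, a 1-byte mark + // type, and a 4-byte word packing the DSA size (upper 27 bits) with the + // flags (low 5 bits).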
+ OutStreamer->AddComment("XPLINK Routine Layout Entry"); + OutStreamer->emitLabel(EPMarkerSym); + OutStreamer->AddComment("Eyecatcher 0x00C300C500C500"); + OutStreamer->emitIntValueInHex(0x00C300C500C500, 7); // Eyecatcher. + OutStreamer->AddComment("Mark Type C'1'"); + OutStreamer->emitInt8(0xF1); // Mark Type. + if (OutStreamer->isVerboseAsm()) { + OutStreamer->AddComment("DSA Size 0x" + Twine::utohexstr(DSASize)); + OutStreamer->AddComment("Entry Flags"); + if (Flags & 0x04) + OutStreamer->AddComment(" Bit 2: 1 = Uses alloca"); + else + OutStreamer->AddComment(" Bit 2: 0 = Does not use alloca"); + } + OutStreamer->emitInt32(DSAAndFlags); + } + + AsmPrinter::emitFunctionEntryLabel(); +} + // Force static initialization. extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() { RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget()); diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h index 6cfd7bd4c486..80d68d1b93ff 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -19,7 +19,6 @@ namespace llvm { class MCStreamer; -class MachineBasicBlock; class MachineInstr; class Module; class raw_ostream; @@ -52,6 +51,7 @@ public: SM.reset(); return AsmPrinter::doInitialization(M); } + void emitFunctionEntryLabel() override; private: void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL); diff --git a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp index 7d21d29d270e..763aa8c0e41f 100644 --- a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp +++ b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp @@ -25,12 +25,6 @@ using namespace llvm; -#define SYSTEMZ_COPYPHYSREGS_NAME "SystemZ Copy Physregs" - -namespace llvm { - void initializeSystemZCopyPhysRegsPass(PassRegistry&); -} - namespace { class SystemZCopyPhysRegs : public MachineFunctionPass { @@ -41,8 +35,6 @@ public: initializeSystemZCopyPhysRegsPass(*PassRegistry::getPassRegistry()); } - StringRef getPassName() const override { return SYSTEMZ_COPYPHYSREGS_NAME; } - bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -59,7 +51,7 @@ char SystemZCopyPhysRegs::ID = 0; } // end anonymous namespace INITIALIZE_PASS(SystemZCopyPhysRegs, "systemz-copy-physregs", - SYSTEMZ_COPYPHYSREGS_NAME, false, false) + "SystemZ Copy Physregs", false, false) FunctionPass *llvm::createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM) { return new SystemZCopyPhysRegs(); diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index 631cbff303e8..4893acc81335 100644 --- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -65,11 +65,8 @@ class SystemZElimCompare : public MachineFunctionPass { public: static char ID; - SystemZElimCompare(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { - return "SystemZ Comparison Elimination"; + SystemZElimCompare() : MachineFunctionPass(ID) { + initializeSystemZElimComparePass(*PassRegistry::getPassRegistry()); } bool processBlock(MachineBasicBlock &MBB); @@ -106,6 +103,9 @@ char SystemZElimCompare::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(SystemZElimCompare, DEBUG_TYPE, + "SystemZ Comparison Elimination", false, false) + // Returns true if MI is an instruction whose output equals the value in Reg. 
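The XPLINK entry-point marker emitted above packs the DSA size and the entry flags into a single 32-bit word. The same arithmetic in isolation (the helper name is ours, not LLVM API):

#include <cstdint>

// The DSA size is a multiple of 32 bytes, so its low five bits are always
// zero and can carry the entry flags; DSASize & 0xFFFFFFE0 is exactly the
// (x/32) << 5 noted in the comment above.
uint32_t encodeDSAAndFlags(uint32_t DSASize, bool UsesAlloca) {
  uint32_t Word = DSASize & 0xFFFFFFE0; // top 27 bits: DSA size in bytes
  if (UsesAlloca)
    Word |= 0x04; // bit 2: function uses alloca
  return Word;
}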
static bool preservesValueOf(MachineInstr &MI, unsigned Reg) { switch (MI.getOpcode()) { @@ -746,5 +746,5 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) { } FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) { - return new SystemZElimCompare(TM); + return new SystemZElimCompare(); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 99ab4c5455d6..ccc7d0737f53 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -103,7 +103,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots( unsigned HighGPR = SystemZ::R15D; int StartSPOffset = SystemZMC::ELFCallFrameSize; for (auto &CS : CSI) { - unsigned Reg = CS.getReg(); + Register Reg = CS.getReg(); int Offset = getRegSpillOffset(MF, Reg); if (Offset) { if (SystemZ::GR64BitRegClass.contains(Reg) && StartSPOffset > Offset) { @@ -124,7 +124,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots( // Also save the GPR varargs, if any. R6D is call-saved, so would // already be included, but we also need to handle the call-clobbered // argument registers. - unsigned FirstGPR = ZFI->getVarArgsFirstGPR(); + Register FirstGPR = ZFI->getVarArgsFirstGPR(); if (FirstGPR < SystemZ::ELFNumArgGPRs) { unsigned Reg = SystemZ::ELFArgGPRs[FirstGPR]; int Offset = getRegSpillOffset(MF, Reg); @@ -143,7 +143,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots( for (auto &CS : CSI) { if (CS.getFrameIdx() != INT32_MAX) continue; - unsigned Reg = CS.getReg(); + Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getSpillSize(*RC); CurrOffset -= Size; @@ -271,7 +271,7 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( // Make sure all call-saved GPRs are included as operands and are // marked as live on entry. for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (SystemZ::GR64BitRegClass.contains(Reg)) addSavedGPR(MBB, MIB, Reg, true); } @@ -284,7 +284,7 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( // Save FPRs/VRs in the normal TargetInstrInfo way. for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), @@ -314,7 +314,7 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( // Restore FPRs/VRs in the normal TargetInstrInfo way. for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); @@ -346,7 +346,7 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( // Do a second scan adding regs as being defined by instruction for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (Reg != RestoreGPRs.LowGPR && Reg != RestoreGPRs.HighGPR && SystemZ::GR64BitRegClass.contains(Reg)) MIB.addReg(Reg, RegState::ImplicitDefine); @@ -500,7 +500,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF, // Add CFI for the GPR saves. 
for (auto &Save : CSI) { - unsigned Reg = Save.getReg(); + Register Reg = Save.getReg(); if (SystemZ::GR64BitRegClass.contains(Reg)) { int FI = Save.getFrameIdx(); int64_t Offset = MFFrame.getObjectOffset(FI); @@ -580,7 +580,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF, // Skip over the FPR/VR saves. SmallVector<unsigned, 8> CFIIndexes; for (auto &Save : CSI) { - unsigned Reg = Save.getReg(); + Register Reg = Save.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { if (MBBI != MBB.end() && (MBBI->getOpcode() == SystemZ::STD || @@ -764,8 +764,7 @@ void SystemZELFFrameLowering::inlineStackProbe( bool SystemZELFFrameLowering::hasFP(const MachineFunction &MF) const { return (MF.getTarget().Options.DisableFramePointerElim(MF) || - MF.getFrameInfo().hasVarSizedObjects() || - MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP()); + MF.getFrameInfo().hasVarSizedObjects()); } StackOffset SystemZELFFrameLowering::getFrameIndexReference( @@ -850,7 +849,7 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) { for (auto &CS : CSIList) { - unsigned Reg = CS.getReg(); + Register Reg = CS.getReg(); int Offset = RegSpillOffsets[Reg]; if (Offset >= 0) { if (GRRegClass.contains(Reg)) { @@ -895,7 +894,7 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( for (auto &CS : CSI) { if (CS.getFrameIdx() != INT32_MAX) continue; - unsigned Reg = CS.getReg(); + Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); Align Alignment = TRI->getSpillAlign(*RC); unsigned Size = TRI->getSpillSize(*RC); @@ -966,7 +965,7 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( // marked as live on entry. auto &GRRegClass = SystemZ::GR64BitRegClass; for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (GRRegClass.contains(Reg)) addSavedGPR(MBB, MIB, Reg, true); } @@ -974,7 +973,7 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( // Spill FPRs to the stack in the normal TargetInstrInfo way for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), @@ -1007,7 +1006,7 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( // Restore FPRs in the normal TargetInstrInfo way. 
for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + Register Reg = CSI[I].getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); @@ -1041,7 +1040,7 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( // Do a second scan adding regs as being defined by instruction for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + Register Reg = CSI[I].getReg(); if (Reg > RestoreGPRs.LowGPR && Reg < RestoreGPRs.HighGPR) MIB.addReg(Reg, RegState::ImplicitDefine); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 106b9e8ebe06..3a1af888d8f9 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -17,7 +17,6 @@ #include "llvm/Support/TypeSize.h" namespace llvm { -class SystemZTargetMachine; class SystemZSubtarget; class SystemZFrameLowering : public TargetFrameLowering { diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 39a82e2c07e0..cf55318d328d 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -62,8 +62,7 @@ struct SystemZAddressingMode { bool IncludesDynAlloc; SystemZAddressingMode(AddrForm form, DispRange dr) - : Form(form), DR(dr), Base(), Disp(0), Index(), - IncludesDynAlloc(false) {} + : Form(form), DR(dr), Disp(0), IncludesDynAlloc(false) {} // True if the address can have an index register. bool hasIndexField() { return Form != FormBD; } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 24de52850771..f10651d5c5d7 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -318,8 +318,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom); - // Use custom expanders so that we can force the function to use - // a frame pointer. setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom); @@ -1571,7 +1569,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments( int FI = MFI.CreateFixedObject(8, -SystemZMC::ELFCallFrameSize + Offset, true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - unsigned VReg = MF.addLiveIn(SystemZ::ELFArgFPRs[I], + Register VReg = MF.addLiveIn(SystemZ::ELFArgFPRs[I], &SystemZ::FP64BitRegClass); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64); MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN, @@ -3417,7 +3415,7 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op, } // Return R14D, which has the return address. Mark it an implicit live-in. 
- unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass); + Register LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT); } @@ -4194,7 +4192,6 @@ SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>(); auto *Regs = Subtarget->getSpecialRegisters(); - MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true); if (MF.getFunction().getCallingConv() == CallingConv::GHC) report_fatal_error("Variable-sized stack allocations are not supported " "in GHC calling convention"); @@ -4207,7 +4204,6 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>(); auto *Regs = Subtarget->getSpecialRegisters(); - MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true); bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain"); if (MF.getFunction().getCallingConv() == CallingConv::GHC) @@ -8318,13 +8314,11 @@ MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin( // Add FPR/VR clobbers. if (!NoFloat && (Control & 4) != 0) { if (Subtarget.hasVector()) { - for (int I = 0; I < 32; I++) { - unsigned Reg = SystemZMC::VR128Regs[I]; + for (unsigned Reg : SystemZMC::VR128Regs) { MI.addOperand(MachineOperand::CreateReg(Reg, true, true)); } } else { - for (int I = 0; I < 16; I++) { - unsigned Reg = SystemZMC::FP64Regs[I]; + for (unsigned Reg : SystemZMC::FP64Regs) { MI.addOperand(MachineOperand::CreateReg(Reg, true, true)); } } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 940c0a857ea4..a8ddb8c62d18 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -381,7 +381,6 @@ enum { } // end namespace SystemZICMP class SystemZSubtarget; -class SystemZTargetMachine; class SystemZTargetLowering : public TargetLowering { public: diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index e80496e37781..6db9bf3056b7 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1309,7 +1309,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( // allocated regs are in an FP reg-class per previous check above. 
for (const MachineOperand &MO : MIB->operands()) if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MRI.getRegClass(Reg) == &SystemZ::VR32BitRegClass) MRI.setRegClass(Reg, &SystemZ::FP32BitRegClass); else if (MRI.getRegClass(Reg) == &SystemZ::VR64BitRegClass) diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp index 06d893d043e9..d6c795985448 100644 --- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp @@ -29,11 +29,8 @@ namespace { class SystemZLDCleanup : public MachineFunctionPass { public: static char ID; - SystemZLDCleanup(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {} - - StringRef getPassName() const override { - return "SystemZ Local Dynamic TLS Access Clean-up"; + SystemZLDCleanup() : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) { + initializeSystemZLDCleanupPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; @@ -52,8 +49,11 @@ char SystemZLDCleanup::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(SystemZLDCleanup, "systemz-ld-cleanup", + "SystemZ Local Dynamic TLS Access Clean-up", false, false) + FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) { - return new SystemZLDCleanup(TM); + return new SystemZLDCleanup(); } void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp index 9c985c16f082..d53693154d40 100644 --- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -135,10 +135,9 @@ class SystemZLongBranch : public MachineFunctionPass { public: static char ID; - SystemZLongBranch(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { return "SystemZ Long Branch"; } + SystemZLongBranch() : MachineFunctionPass(ID) { + initializeSystemZLongBranchPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; @@ -174,6 +173,9 @@ const uint64_t MaxForwardRange = 0xfffe; } // end anonymous namespace +INITIALIZE_PASS(SystemZLongBranch, DEBUG_TYPE, "SystemZ Long Branch", false, + false) + // Position describes the state immediately before Block. Update Block // accordingly and move Position to the end of the block's non-terminator // instructions. 
@@ -481,5 +483,5 @@ bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) { } FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) { - return new SystemZLongBranch(TM); + return new SystemZLongBranch(); } diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h index 14ad06488312..eb09033d1850 100644 --- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h +++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h @@ -18,7 +18,6 @@ class MCInst; class MCOperand; class MachineInstr; class MachineOperand; -class Mangler; class SystemZAsmPrinter; class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower { diff --git a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h index f755d5cd3d5b..ec4b812eb0e1 100644 --- a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h @@ -34,14 +34,12 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo { unsigned VarArgsFrameIndex; unsigned RegSaveFrameIndex; int FramePointerSaveIndex; - bool ManipulatesSP; unsigned NumLocalDynamics; public: explicit SystemZMachineFunctionInfo(MachineFunction &MF) : VarArgsFirstGPR(0), VarArgsFirstFPR(0), VarArgsFrameIndex(0), - RegSaveFrameIndex(0), FramePointerSaveIndex(0), ManipulatesSP(false), - NumLocalDynamics(0) {} + RegSaveFrameIndex(0), FramePointerSaveIndex(0), NumLocalDynamics(0) {} // Get and set the first and last call-saved GPR that should be saved by // this function and the SP offset for the STMG. These are 0 if no GPRs @@ -85,11 +83,6 @@ public: int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } - // Get and set whether the function directly manipulates the stack pointer, - // e.g. through STACKSAVE or STACKRESTORE. - bool getManipulatesSP() const { return ManipulatesSP; } - void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; } - // Count number of local-dynamic TLS symbols used. unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp index aaa7f8fc88f5..5a2cfc53da49 100644 --- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -21,16 +21,10 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" using namespace llvm; -#define SYSTEMZ_POSTREWRITE_NAME "SystemZ Post Rewrite pass" - #define DEBUG_TYPE "systemz-postrewrite" STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops."); STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)"); -namespace llvm { - void initializeSystemZPostRewritePass(PassRegistry&); -} - namespace { class SystemZPostRewrite : public MachineFunctionPass { @@ -44,8 +38,6 @@ public: bool runOnMachineFunction(MachineFunction &Fn) override; - StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; } - private: void selectLOCRMux(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -70,7 +62,7 @@ char SystemZPostRewrite::ID = 0; } // end anonymous namespace INITIALIZE_PASS(SystemZPostRewrite, "systemz-post-rewrite", - SYSTEMZ_POSTREWRITE_NAME, false, false) + "SystemZ Post Rewrite pass", false, false) /// Returns an instance of the Post Rewrite pass. 
FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) { @@ -178,15 +170,15 @@ bool SystemZPostRewrite::expandCondMove(MachineBasicBlock &MBB, MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB); RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end()); RestMBB->transferSuccessors(&MBB); - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - RestMBB->addLiveIn(*I); + for (MCPhysReg R : LiveRegs) + RestMBB->addLiveIn(R); // Create a new block MoveMBB to hold the move instruction. MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB); MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB); MoveMBB->addLiveIn(SrcReg); - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MoveMBB->addLiveIn(*I); + for (MCPhysReg R : LiveRegs) + MoveMBB->addLiveIn(R); // At the end of MBB, create a conditional branch to RestMBB if the // condition is false, otherwise fall through to MoveMBB. diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index a4a5b1fbdf90..da6725777e43 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -17,8 +17,6 @@ namespace llvm { -class SystemZTargetMachine; - class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo { public: explicit SystemZSelectionDAGInfo() = default; diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index 254e5e92449b..92930dad80ef 100644 --- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -26,11 +26,7 @@ namespace { class SystemZShortenInst : public MachineFunctionPass { public: static char ID; - SystemZShortenInst(const SystemZTargetMachine &tm); - - StringRef getPassName() const override { - return "SystemZ Instruction Shortening"; - } + SystemZShortenInst(); bool processBlock(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &F) override; @@ -56,12 +52,17 @@ private: char SystemZShortenInst::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(SystemZShortenInst, DEBUG_TYPE, + "SystemZ Instruction Shortening", false, false) + FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) { - return new SystemZShortenInst(TM); + return new SystemZShortenInst(); } -SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr) {} +SystemZShortenInst::SystemZShortenInst() + : MachineFunctionPass(ID), TII(nullptr) { + initializeSystemZShortenInstPass(*PassRegistry::getPassRegistry()); +} // Tie operands if MI has become a two-address instruction. 
static void tieOpsIfNeeded(MachineInstr &MI) { diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp index 0f03d96655bf..75c0d454d904 100644 --- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -89,7 +89,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasSoftFloat(false), TargetTriple(TT), SpecialRegisters(initializeSpecialRegisters()), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(), FrameLowering(SystemZFrameLowering::create(*this)) {} + FrameLowering(SystemZFrameLowering::create(*this)) {} bool SystemZSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h index 67c5b8eb09b6..98f7094fcb48 100644 --- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h +++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h @@ -85,7 +85,7 @@ private: SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); - SystemZCallingConventionRegisters *initializeSpecialRegisters(void); + SystemZCallingConventionRegisters *initializeSpecialRegisters(); public: SystemZSubtarget(const Triple &TT, const std::string &CPU, diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp index 7cb7dca2ea28..f62afb8ddfcf 100644 --- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp @@ -61,10 +61,6 @@ using namespace llvm; -namespace llvm { - void initializeSystemZTDCPassPass(PassRegistry&); -} - namespace { class SystemZTDCPass : public FunctionPass { diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index deb3358102ed..f1469fe8f56b 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -32,6 +32,14 @@ using namespace llvm; extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() { // Register the target. RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget()); + auto &PR = *PassRegistry::getPassRegistry(); + initializeSystemZElimComparePass(PR); + initializeSystemZShortenInstPass(PR); + initializeSystemZLongBranchPass(PR); + initializeSystemZLDCleanupPass(PR); + initializeSystemZCopyPhysRegsPass(PR); + initializeSystemZPostRewritePass(PR); + initializeSystemZTDCPassPass(PR); } // Determine whether we use the vector ABI.
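The LLVMInitializeSystemZTarget body above now registers each of the seven SystemZ machine passes declared in SystemZ.h with the global PassRegistry. For reference, a condensed sketch of the pattern the individual pass files pair with it (ExamplePass is illustrative; the real passes are the ones listed above):

#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/PassRegistry.h"

using namespace llvm;

namespace llvm {
void initializeExamplePassPass(PassRegistry &);
} // namespace llvm

namespace {
class ExamplePass : public MachineFunctionPass {
public:
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {
    // Self-registration keeps the pass usable even if the target hook never
    // ran; INITIALIZE_PASS guards the work with call_once, so repeated
    // calls are harmless.
    initializeExamplePassPass(*PassRegistry::getPassRegistry());
  }
  bool runOnMachineFunction(MachineFunction &MF) override { return false; }
};
char ExamplePass::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(ExamplePass, "example-pass", "Example Pass", false, false)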
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index fd9dc32b04f5..4a318e493c52 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -210,7 +210,7 @@ private: }; public: - VEOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + VEOperand(KindTy K) : Kind(K) {} bool isToken() const override { return Kind == k_Token; } bool isReg() const override { return Kind == k_Register; } diff --git a/llvm/lib/Target/VE/LVLGen.cpp b/llvm/lib/Target/VE/LVLGen.cpp index c4588926af9e..4db6a59284c2 100644 --- a/llvm/lib/Target/VE/LVLGen.cpp +++ b/llvm/lib/Target/VE/LVLGen.cpp @@ -125,8 +125,8 @@ bool LVLGen::runOnMachineFunction(MachineFunction &F) { TII = Subtarget.getInstrInfo(); TRI = Subtarget.getRegisterInfo(); - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : F) + Changed |= runOnMachineBasicBlock(MBB); if (Changed) { LLVM_DEBUG(dbgs() << "\n"); diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h index 7fb8a556aa74..f0bb6e3acdee 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h @@ -27,10 +27,6 @@ class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; class Target; -class Triple; -class StringRef; -class raw_pwrite_stream; -class raw_ostream; MCCodeEmitter *createVEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h index 8c1fa840f19c..2a729a1a311c 100644 --- a/llvm/lib/Target/VE/VE.h +++ b/llvm/lib/Target/VE/VE.h @@ -22,7 +22,6 @@ namespace llvm { class FunctionPass; class VETargetMachine; -class formatted_raw_ostream; class AsmPrinter; class MCInst; class MachineInstr; diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp new file mode 100644 index 000000000000..af3e4af13814 --- /dev/null +++ b/llvm/lib/Target/VE/VECustomDAG.cpp @@ -0,0 +1,81 @@ +//===-- VECustomDAG.cpp - VE Custom DAG Nodes ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the helper functions that VE uses to lower LLVM code +// into a selection DAG. +// +//===----------------------------------------------------------------------===// + +#include "VECustomDAG.h" + +#ifndef DEBUG_TYPE +#define DEBUG_TYPE "vecustomdag" +#endif + +namespace llvm { + +static const int StandardVectorWidth = 256; + +bool isPackedVectorType(EVT SomeVT) { + if (!SomeVT.isVector()) + return false; + return SomeVT.getVectorNumElements() > StandardVectorWidth; +} + +/// \returns the VVP_* SDNode opcode corresponding to \p OC. +Optional<unsigned> getVVPOpcode(unsigned Opcode) { + switch (Opcode) { +#define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \ + case ISD::VPOPC: \ + return VEISD::VVPNAME; +#define ADD_VVP_OP(VVPNAME, SDNAME) \ + case VEISD::VVPNAME: \ + case ISD::SDNAME: \ + return VEISD::VVPNAME; +#include "VVPNodes.def" + } + return None; +} + +bool isVVPBinaryOp(unsigned VVPOpcode) { + switch (VVPOpcode) { +#define ADD_BINARY_VVP_OP(VVPNAME, ...)
\ + case VEISD::VVPNAME: \ + return true; +#include "VVPNodes.def" + } + return false; +} + +SDValue VECustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget, + bool IsOpaque) const { + return DAG.getConstant(Val, DL, VT, IsTarget, IsOpaque); +} + +SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar, + SDValue AVL) const { + assert(ResultVT.isVector()); + auto ScaVT = Scalar.getValueType(); + assert(ScaVT != MVT::i1 && "TODO: Mask broadcasts"); + + if (isPackedVectorType(ResultVT)) { + // v512x packed mode broadcast + // Replicate the scalar reg (f32 or i32) onto the opposing half of the full + // scalar register. If it's an I64 type, assume that this has already + // happened. + if (ScaVT == MVT::f32) { + Scalar = getNode(VEISD::REPL_F32, MVT::i64, Scalar); + } else if (ScaVT == MVT::i32) { + Scalar = getNode(VEISD::REPL_I32, MVT::i64, Scalar); + } + } + + return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL}); +} + +} // namespace llvm diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h new file mode 100644 index 000000000000..ddd6ce783366 --- /dev/null +++ b/llvm/lib/Target/VE/VECustomDAG.h @@ -0,0 +1,79 @@ +//===------------ VECustomDAG.h - VE Custom DAG Nodes -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the helper functions that VE uses to lower LLVM code into a +// selection DAG. For example, hiding SDLoc, and easy to use SDNodeFlags. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_VECUSTOMDAG_H +#define LLVM_LIB_TARGET_VE_VECUSTOMDAG_H + +#include "VE.h" +#include "VEISelLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" + +namespace llvm { + +Optional<unsigned> getVVPOpcode(unsigned Opcode); + +bool isVVPBinaryOp(unsigned Opcode); + +bool isPackedVectorType(EVT SomeVT); + +class VECustomDAG { + SelectionDAG &DAG; + SDLoc DL; + +public: + SelectionDAG *getDAG() const { return &DAG; } + + VECustomDAG(SelectionDAG &DAG, SDLoc DL) : DAG(DAG), DL(DL) {} + + VECustomDAG(SelectionDAG &DAG, SDValue WhereOp) : DAG(DAG), DL(WhereOp) {} + + VECustomDAG(SelectionDAG &DAG, const SDNode *WhereN) : DAG(DAG), DL(WhereN) {} + + /// getNode { + SDValue getNode(unsigned OC, SDVTList VTL, ArrayRef<SDValue> OpV, + Optional<SDNodeFlags> Flags = None) const { + auto N = DAG.getNode(OC, DL, VTL, OpV); + if (Flags) + N->setFlags(*Flags); + return N; + } + + SDValue getNode(unsigned OC, ArrayRef<EVT> ResVT, ArrayRef<SDValue> OpV, + Optional<SDNodeFlags> Flags = None) const { + auto N = DAG.getNode(OC, DL, ResVT, OpV); + if (Flags) + N->setFlags(*Flags); + return N; + } + + SDValue getNode(unsigned OC, EVT ResVT, ArrayRef<SDValue> OpV, + Optional<SDNodeFlags> Flags = None) const { + auto N = DAG.getNode(OC, DL, ResVT, OpV); + if (Flags) + N->setFlags(*Flags); + return N; + } + + SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); } + /// } getNode + + SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false, + bool IsOpaque = false) const; + + SDValue getBroadcast(EVT ResultVT, SDValue Scalar, SDValue AVL) const; +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_VE_VECUSTOMDAG_H diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp 
b/llvm/lib/Target/VE/VEISelLowering.cpp index 5ef223d6030b..9137c476777e 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -13,6 +13,7 @@ #include "VEISelLowering.h" #include "MCTargetDesc/VEMCExpr.h" +#include "VECustomDAG.h" #include "VEInstrBuilder.h" #include "VEMachineFunctionInfo.h" #include "VERegisterInfo.h" @@ -419,7 +420,7 @@ SDValue VETargetLowering::LowerFormalArguments( // All integer register arguments are promoted by the caller to i64. // Create a virtual register for the promoted live-in value. - unsigned VReg = + Register VReg = MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT())); SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT()); @@ -754,7 +755,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(!VA.needsCustom() && "Unexpected custom lowering"); - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can // reside in the same register in the high and low bits. Reuse the @@ -898,6 +899,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const { TARGET_NODE_CASE(RET_FLAG) TARGET_NODE_CASE(TS1AM) TARGET_NODE_CASE(VEC_BROADCAST) + TARGET_NODE_CASE(REPL_I32) + TARGET_NODE_CASE(REPL_F32) // Register the VVP_* SDNodes. #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME) @@ -1545,7 +1548,7 @@ static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG, unsigned Depth = Op.getConstantOperandVal(0); const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = RegInfo->getFrameRegister(MF); + Register FrameReg = RegInfo->getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT); while (Depth--) @@ -1640,28 +1643,26 @@ static SDValue getSplatValue(SDNode *N) { SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - unsigned NumEls = Op.getValueType().getVectorNumElements(); - MVT ElemVT = Op.getSimpleValueType().getVectorElementType(); + VECustomDAG CDAG(DAG, Op); + MVT ResultVT = Op.getSimpleValueType(); // If there is just one element, expand to INSERT_VECTOR_ELT. unsigned UniqueIdx; if (getUniqueInsertion(Op.getNode(), UniqueIdx)) { - SDValue AccuV = DAG.getUNDEF(Op.getValueType()); + SDValue AccuV = CDAG.getUNDEF(Op.getValueType()); auto ElemV = Op->getOperand(UniqueIdx); - SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV, - ElemV, IdxV); + SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64); + return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV}); } // Else emit a broadcast. if (SDValue ScalarV = getSplatValue(Op.getNode())) { - // lower to VEC_BROADCAST - MVT LegalResVT = MVT::getVectorVT(ElemVT, 256); - - auto AVL = DAG.getConstant(NumEls, DL, MVT::i32); - return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0), - AVL); + unsigned NumEls = ResultVT.getVectorNumElements(); + // TODO: Legalize packed-mode AVL. + // For now, cap the AVL at 256. 
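The lowering rewrites above route node creation through the new VECustomDAG wrapper so the debug location is pinned once instead of being repeated on every call. A usage sketch against the API shown earlier (the function is illustrative, not part of the patch):

#include "VECustomDAG.h"
#include <algorithm>

using namespace llvm;

// Broadcast a scalar with the AVL capped at 256, mirroring the
// lowerBUILD_VECTOR flow above; no SDLoc argument is repeated per call.
static SDValue broadcastExample(SDValue Op, SelectionDAG &DAG) {
  VECustomDAG CDAG(DAG, Op); // debug location taken from Op once
  EVT ResultVT = Op.getValueType();
  unsigned NumEls = ResultVT.getVectorNumElements();
  SDValue AVL = CDAG.getConstant(std::min<unsigned>(256, NumEls), MVT::i32);
  return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
}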
+ auto CappedLength = std::min<unsigned>(256, NumEls); + auto AVL = CDAG.getConstant(CappedLength, MVT::i32); + return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL); } // Expand @@ -1720,7 +1721,7 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); -#define ADD_BINARY_VVP_OP(VVP_NAME, VP_NAME, ISD_NAME) case ISD::ISD_NAME: +#define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME: #include "VVPNodes.def" return lowerToVVP(Op, DAG); } @@ -2666,21 +2667,6 @@ bool VETargetLowering::hasAndNot(SDValue Y) const { return true; } -/// \returns the VVP_* SDNode opcode corresponsing to \p OC. -static Optional<unsigned> getVVPOpcode(unsigned Opcode) { - switch (Opcode) { -#define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \ - case ISD::VPOPC: \ - return VEISD::VVPNAME; -#define ADD_VVP_OP(VVPNAME, SDNAME) \ - case VEISD::VVPNAME: \ - case ISD::SDNAME: \ - return VEISD::VVPNAME; -#include "VVPNodes.def" - } - return None; -} - SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { // Can we represent this as a VVP node. const unsigned Opcode = Op->getOpcode(); @@ -2691,7 +2677,7 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { const bool FromVP = ISD::isVPOpcode(Opcode); // The representative and legalized vector type of this operation. - SDLoc DL(Op); + VECustomDAG CDAG(DAG, Op); MVT MaskVT = MVT::v256i1; // TODO: packed mode. EVT OpVecVT = Op.getValueType(); EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT); @@ -2708,27 +2694,21 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { } else { // Materialize the VL parameter. - AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32); - SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32); - Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT, - ConstTrue); // emit a VEISD::VEC_BROADCAST here. + AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32); + SDValue ConstTrue = CDAG.getConstant(1, MVT::i32); + Mask = CDAG.getBroadcast(MaskVT, ConstTrue, AVL); } - // Categories we are interested in. - bool IsBinaryOp = false; - - switch (VVPOpcode) { -#define ADD_BINARY_VVP_OP(VVPNAME, ...) 
\ - case VEISD::VVPNAME: \ - IsBinaryOp = true; \ - break; -#include "VVPNodes.def" - } - - if (IsBinaryOp) { + if (isVVPBinaryOp(VVPOpcode)) { assert(LegalVecVT.isSimple()); - return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0), - Op->getOperand(1), Mask, AVL); + return CDAG.getNode(VVPOpcode, LegalVecVT, + {Op->getOperand(0), Op->getOperand(1), Mask, AVL}); + } + if (VVPOpcode == VEISD::VVP_SELECT) { + auto Mask = Op->getOperand(0); + auto OnTrue = Op->getOperand(1); + auto OnFalse = Op->getOperand(2); + return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL}); } llvm_unreachable("lowerToVVP called for unexpected SDNode."); } @@ -2750,7 +2730,7 @@ SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Idx = Op.getOperand(1); SDLoc DL(Op); SDValue Result = Op; - if (0 /* Idx->isConstant() */) { + if (false /* Idx->isConstant() */) { // TODO: optimized implementation using constant values } else { SDValue Const1 = DAG.getConstant(1, DL, MVT::i64); @@ -2808,7 +2788,7 @@ SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); SDValue Result = Op; - if (0 /* Idx->isConstant()*/) { + if (false /* Idx->isConstant()*/) { // TODO: optimized implementation using constant values } else { SDValue Const1 = DAG.getConstant(1, DL, MVT::i64); diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index b4ce8906fd51..09bd19e83717 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -40,6 +40,8 @@ enum NodeType : unsigned { TS1AM, // A TS1AM instruction used for 1/2 bytes swap. VEC_BROADCAST, // A vector broadcast instruction. // 0: scalar value, 1: VL + REPL_I32, + REPL_F32, // Replicate subregister to other half. // VVP_* nodes. #define ADD_VVP_OP(VVP_NAME, ...) VVP_NAME, @@ -219,4 +221,4 @@ public: }; } // namespace llvm -#endif // VE_ISELLOWERING_H +#endif // LLVM_LIB_TARGET_VE_VEISELLOWERING_H diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index 46846edfeafb..7c1bd5201867 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -248,7 +248,7 @@ unsigned VEInstrInfo::insertBranch(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI = &getRegisterInfo(); MachineFunction *MF = MBB.getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Reg = Cond[2].getReg(); + Register Reg = Cond[2].getReg(); if (IsIntegerCC(Cond[0].getImm())) { if (TRI->getRegSizeInBits(Reg, MRI) == 32) { opc[0] = VE::BRCFWir; diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index c3abbe2cafab..717427c3f48d 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -1576,6 +1576,12 @@ def f2l : OutPatFrag<(ops node:$exp), def l2f : OutPatFrag<(ops node:$exp), (EXTRACT_SUBREG $exp, sub_f32)>; +// Zero out subregisters. +def zero_i32 : OutPatFrag<(ops node:$expr), + (ANDrm $expr, 32)>; +def zero_f32 : OutPatFrag<(ops node:$expr), + (ANDrm $expr, !add(32, 64))>; + // Small immediates. def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>; def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>; @@ -2287,6 +2293,16 @@ class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>; def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2, [SDTCisVec<0>, IsVLVT<2>]>>; +// replicate lower 32bit to upper 32bit (f32 scalar replication). 
+def repl_f32 : SDNode<"VEISD::REPL_F32", + SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisFP<1>]>>; +// replicate upper 32bit to lower 32 bit (i32 scalar replication). +def repl_i32 : SDNode<"VEISD::REPL_I32", + SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisInt<1>]>>; + + // Whether this is an all-true mask (assuming undef-bits above VL are all-true). def true_mask : PatLeaf< (vec_broadcast (i32 nonzero), (i32 srcvalue))>; diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td index dc3c913c918a..6c5b80315efb 100644 --- a/llvm/lib/Target/VE/VEInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td @@ -15,6 +15,17 @@ // Instruction format superclass //===----------------------------------------------------------------------===// +// Sub-register replication for packed broadcast. +def: Pat<(i64 (repl_f32 f32:$val)), + (ORrr + (SRLri (f2l $val), 32), + (zero_i32 (f2l $val)))>; +def: Pat<(i64 (repl_i32 i32:$val)), + (ORrr + (zero_f32 (i2l $val)), + (SLLri (i2l $val), 32))>; + + multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp, SDNodeXForm ImmCast, OutPatFrag SuperRegCast> { // VBRDil @@ -89,3 +100,8 @@ defm : patterns_elem32<v256f32, f32, simm7fp, LO7FP, l2f, f2l>; defm : patterns_elem64<v256i64, i64, simm7, LO7>; defm : patterns_elem64<v256f64, f64, simm7fp, LO7FP>; + +defm : vbrd_elem64<v512i32, i64, simm7, LO7>; +defm : vbrd_elem64<v512f32, i64, simm7, LO7>; +defm : vbrd_elem64<v512i32, f64, simm7fp, LO7FP>; +defm : vbrd_elem64<v512f32, f64, simm7fp, LO7FP>; diff --git a/llvm/lib/Target/VE/VEMCInstLower.cpp b/llvm/lib/Target/VE/VEMCInstLower.cpp index bc5577ce4f97..57195f238cf6 100644 --- a/llvm/lib/Target/VE/VEMCInstLower.cpp +++ b/llvm/lib/Target/VE/VEMCInstLower.cpp @@ -78,8 +78,7 @@ void llvm::LowerVEMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP) { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MI, MO, AP); if (MCOp.isValid()) diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.h b/llvm/lib/Target/VE/VEMachineFunctionInfo.h index 16b25fed3f11..3160f6a552d7 100644 --- a/llvm/lib/Target/VE/VEMachineFunctionInfo.h +++ b/llvm/lib/Target/VE/VEMachineFunctionInfo.h @@ -29,10 +29,9 @@ private: bool IsLeafProc; public: - VEMachineFunctionInfo() - : GlobalBaseReg(), VarArgsFrameOffset(0), IsLeafProc(false) {} + VEMachineFunctionInfo() : VarArgsFrameOffset(0), IsLeafProc(false) {} explicit VEMachineFunctionInfo(MachineFunction &MF) - : GlobalBaseReg(), VarArgsFrameOffset(0), IsLeafProc(false) {} + : VarArgsFrameOffset(0), IsLeafProc(false) {} Register getGlobalBaseReg() const { return GlobalBaseReg; } void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; } diff --git a/llvm/lib/Target/VE/VESubtarget.h b/llvm/lib/Target/VE/VESubtarget.h index 213aca2ea3f9..0c3dc0a08072 100644 --- a/llvm/lib/Target/VE/VESubtarget.h +++ b/llvm/lib/Target/VE/VESubtarget.h @@ -76,7 +76,7 @@ public: /// Get the size of RSA, return address, and frame pointer as described /// in VEFrameLowering.cpp. 
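The repl_i32/repl_f32 patterns above assemble a 64-bit value whose two 32-bit halves carry the same payload, which is what the new packed-mode (v512) broadcasts need. The net effect, written as plain C++ (illustrative only):

#include <cstdint>

// Mirror a 32-bit payload into both halves of a 64-bit register image; the
// OR/shift patterns above implement this with the i2l/f2l subregister moves.
uint64_t replicateHalves(uint32_t Payload) {
  return (uint64_t(Payload) << 32) | Payload;
}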
- unsigned getRsaSize(void) const { return 176; }; + unsigned getRsaSize() const { return 176; }; bool isTargetLinux() const { return TargetTriple.isOSLinux(); } }; diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td index 99566e91ec11..ef9c238066c0 100644 --- a/llvm/lib/Target/VE/VVPInstrInfo.td +++ b/llvm/lib/Target/VE/VVPInstrInfo.td @@ -39,6 +39,15 @@ def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc. IsVLVT<4> ]>; +// Select(OnTrue, OnFalse, SelMask, vl) +def SDTSelectVVP : SDTypeProfile<1, 4, [ // vp_select, vp_merge + SDTCisVec<0>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>, + IsVLVT<4> +]>; + // Binary operator commutative pattern. class vvp_commutative<SDNode RootOp> : PatFrags< @@ -79,3 +88,5 @@ def c_vvp_fmul : vvp_commutative<vvp_fmul>; def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>; // } Binary Operators + +def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>; diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td index 8d5d9d103547..74720fd1f419 100644 --- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td @@ -20,8 +20,22 @@ include "VVPInstrInfo.td" multiclass Binary_rv<SDPatternOperator OpNode, ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, string OpBaseName> { - // Masked with select, broadcast. - // TODO + // Masked with passthru, broadcast. + def : Pat<(vvp_select + (OpNode + (any_broadcast ScalarVT:$sx), + DataVT:$vy, + (MaskVT srcvalue), + (i32 srcvalue)), + DataVT:$vfalse, + MaskVT:$mask, + i32:$pivot), + (!cast<Instruction>(OpBaseName#"rvml_v") + ScalarVT:$sx, + $vy, + $mask, + $pivot, + $vfalse)>; // Unmasked, broadcast. def : Pat<(OpNode @@ -42,8 +56,22 @@ multiclass Binary_rv<SDPatternOperator OpNode, multiclass Binary_vr<SDPatternOperator OpNode, ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, string OpBaseName> { - // Masked with select, broadcast. - // TODO + // Masked with passthru, broadcast. + def : Pat<(vvp_select + (OpNode + DataVT:$vx, + (any_broadcast ScalarVT:$sy), + (MaskVT srcvalue), + (i32 srcvalue)), + DataVT:$vfalse, + MaskVT:$mask, + i32:$pivot), + (!cast<Instruction>(OpBaseName#"vrml_v") + $vx, + ScalarVT:$sy, + $mask, + $pivot, + $vfalse)>; // Unmasked, broadcast. def : Pat<(OpNode @@ -64,6 +92,23 @@ multiclass Binary_vr<SDPatternOperator OpNode, multiclass Binary_vv<SDPatternOperator OpNode, ValueType DataVT, ValueType MaskVT, string OpBaseName> { + // Masked with passthru, broadcast. + def : Pat<(vvp_select + (OpNode + DataVT:$vx, + DataVT:$vy, + (MaskVT srcvalue), + (i32 srcvalue)), + DataVT:$vfalse, + MaskVT:$mask, + i32:$pivot), + (!cast<Instruction>(OpBaseName#"vvml_v") + $vx, + $vy, + $mask, + $pivot, + $vfalse)>; + // Masked with select. // TODO @@ -191,3 +236,35 @@ defm : Binary_rv_vv_ShortLong<vvp_fsub, defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv, f64, v256f64, "VFDIVD", f32, v256f32, "VFDIVS">; + +multiclass Merge_mvv< + SDPatternOperator OpNode, + ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + // Masked. 
+ def : Pat<(OpNode + DataVT:$vtrue, DataVT:$vfalse, + MaskVT:$vm, + i32:$avl), + (!cast<Instruction>(OpBaseName#"vvml_v") + $vfalse, $vtrue, $vm, $avl, $vfalse)>; +} + +multiclass Merge_mvv_ShortLong< + SDPatternOperator OpNode, + ValueType LongDataVT, ValueType ShortDataVT, + string OpBaseName> { + defm : Merge_mvv<OpNode, + LongDataVT, v256i1, + OpBaseName>; + defm : Merge_mvv<OpNode, + ShortDataVT, v256i1, + OpBaseName>; +} + +defm : Merge_mvv_ShortLong<vvp_select, + v256f64, + v256f32, "VMRG">; +defm : Merge_mvv_ShortLong<vvp_select, + v256i64, + v256i32, "VMRG">; diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def index 8a9231f7d3e6..8000f84c5dbe 100644 --- a/llvm/lib/Target/VE/VVPNodes.def +++ b/llvm/lib/Target/VE/VVPNodes.def @@ -59,6 +59,11 @@ ADD_BINARY_VVP_OP_COMPACT(FSUB) ADD_BINARY_VVP_OP_COMPACT(FMUL) ADD_BINARY_VVP_OP_COMPACT(FDIV) +// Shuffles. +ADD_VVP_OP(VVP_SELECT,VSELECT) +HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT) +HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT) + #undef ADD_BINARY_VVP_OP #undef ADD_BINARY_VVP_OP_COMPACT #undef ADD_VVP_OP diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index b2f10ca93a4f..75d5d0675990 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -26,7 +26,6 @@ class MCAsmBackend; class MCCodeEmitter; class MCInstrInfo; class MCObjectTargetWriter; -class MVT; class Triple; MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h index d024185defb4..57e40f6cd8d7 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h @@ -25,7 +25,6 @@ class MachineInstr; class MachineOperand; class MCContext; class MCSymbolWasm; -class StringRef; class WebAssemblyFunctionInfo; class WebAssemblySubtarget; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index 910a4e5e0d1a..eeec0fc671cc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -406,7 +406,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { // TODO: Sort the locals for better compression. MFI.setNumLocals(CurLocal - MFI.getParams().size()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); auto RL = Reg2Local.find(Reg); if (RL == Reg2Local.end() || RL->second < MFI.getParams().size()) continue; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 642aa6b4028a..406edef8ff3f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -286,7 +286,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { } if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) { // An unscaled add of a register. Set it as the new base. 
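In the VVPNodes.def hunk above, both VP_SELECT and VP_MERGE map onto VVP_SELECT, whose SDTSelectVVP profile orders operands as (OnTrue, OnFalse, Mask, AVL), unlike ISD::VSELECT's (Mask, OnTrue, OnFalse). A small sketch of the reorder lowerToVVP performs (struct and function names are ours):

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

struct VVPSelectOperands {
  SDValue OnTrue, OnFalse, Mask, AVL;
};

// ISD::VSELECT and VP_SELECT put the mask first; VVP_SELECT wants it third,
// followed by the vector length.
static VVPSelectOperands reorderSelect(SDNode *N, SDValue AVL) {
  return {N->getOperand(1), N->getOperand(2), N->getOperand(0), AVL};
}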
- unsigned Reg = getRegForValue(Op); + Register Reg = getRegForValue(Op); if (Reg == 0) return false; Addr.setReg(Reg); @@ -372,7 +372,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { if (Addr.isSet()) { return false; } - unsigned Reg = getRegForValue(Obj); + Register Reg = getRegForValue(Obj); if (Reg == 0) return false; Addr.setReg(Reg); @@ -430,7 +430,7 @@ unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, } Not = false; - unsigned Reg = getRegForValue(V); + Register Reg = getRegForValue(V); if (Reg == 0) return 0; return maskI1Value(Reg, V); @@ -458,12 +458,12 @@ unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V, return 0; } - unsigned Imm = createResultReg(&WebAssembly::I32RegClass); + Register Imm = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::CONST_I32), Imm) .addImm(~(~uint64_t(0) << MVT(From).getSizeInBits())); - unsigned Result = createResultReg(&WebAssembly::I32RegClass); + Register Result = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::AND_I32), Result) .addReg(Reg) @@ -488,18 +488,18 @@ unsigned WebAssemblyFastISel::signExtendToI32(unsigned Reg, const Value *V, return 0; } - unsigned Imm = createResultReg(&WebAssembly::I32RegClass); + Register Imm = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::CONST_I32), Imm) .addImm(32 - MVT(From).getSizeInBits()); - unsigned Left = createResultReg(&WebAssembly::I32RegClass); + Register Left = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::SHL_I32), Left) .addReg(Reg) .addReg(Imm); - unsigned Right = createResultReg(&WebAssembly::I32RegClass); + Register Right = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::SHR_S_I32), Right) .addReg(Left) @@ -517,7 +517,7 @@ unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V, Reg = zeroExtendToI32(Reg, V, From); - unsigned Result = createResultReg(&WebAssembly::I64RegClass); + Register Result = createResultReg(&WebAssembly::I64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::I64_EXTEND_U_I32), Result) .addReg(Reg); @@ -539,7 +539,7 @@ unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V, Reg = signExtendToI32(Reg, V, From); - unsigned Result = createResultReg(&WebAssembly::I64RegClass); + Register Result = createResultReg(&WebAssembly::I64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::I64_EXTEND_S_I32), Result) .addReg(Reg); @@ -555,7 +555,7 @@ unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V, unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) { MVT::SimpleValueType From = getSimpleType(V->getType()); MVT::SimpleValueType To = getLegalType(From); - unsigned VReg = getRegForValue(V); + Register VReg = getRegForValue(V); if (VReg == 0) return 0; return zeroExtend(VReg, V, From, To); @@ -564,7 +564,7 @@ unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) { unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) { MVT::SimpleValueType From = getSimpleType(V->getType()); MVT::SimpleValueType To = getLegalType(From); - unsigned VReg = getRegForValue(V); + Register VReg = getRegForValue(V); if (VReg == 0) return 0; return signExtend(VReg, 
V, From, To); @@ -578,7 +578,7 @@ unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V, unsigned WebAssemblyFastISel::notValue(unsigned Reg) { assert(MRI.getRegClass(Reg) == &WebAssembly::I32RegClass); - unsigned NotReg = createResultReg(&WebAssembly::I32RegClass); + Register NotReg = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::EQZ_I32), NotReg) .addReg(Reg); @@ -586,7 +586,7 @@ unsigned WebAssemblyFastISel::notValue(unsigned Reg) { } unsigned WebAssemblyFastISel::copyValue(unsigned Reg) { - unsigned ResultReg = createResultReg(MRI.getRegClass(Reg)); + Register ResultReg = createResultReg(MRI.getRegClass(Reg)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::COPY), ResultReg) .addReg(Reg); @@ -598,7 +598,7 @@ unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = + Register ResultReg = createResultReg(Subtarget->hasAddr64() ? &WebAssembly::I64RegClass : &WebAssembly::I32RegClass); unsigned Opc = @@ -617,7 +617,7 @@ unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) { return 0; if (GV->isThreadLocal()) return 0; - unsigned ResultReg = + Register ResultReg = createResultReg(Subtarget->hasAddr64() ? &WebAssembly::I64RegClass : &WebAssembly::I32RegClass); unsigned Opc = Subtarget->hasAddr64() ? WebAssembly::CONST_I64 @@ -715,7 +715,7 @@ bool WebAssemblyFastISel::fastLowerArguments() { default: return false; } - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addImm(I); updateValueMap(&Arg, ResultReg); @@ -887,7 +887,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { if (Subtarget->hasAddr64()) { auto Wrap = BuildMI(*FuncInfo.MBB, std::prev(FuncInfo.InsertPt), DbgLoc, TII.get(WebAssembly::I32_WRAP_I64)); - unsigned Reg32 = createResultReg(&WebAssembly::I32RegClass); + Register Reg32 = createResultReg(&WebAssembly::I32RegClass); Wrap.addReg(Reg32, RegState::Define); Wrap.addReg(CalleeReg); CalleeReg = Reg32; @@ -914,11 +914,11 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { if (CondReg == 0) return false; - unsigned TrueReg = getRegForValue(Select->getTrueValue()); + Register TrueReg = getRegForValue(Select->getTrueValue()); if (TrueReg == 0) return false; - unsigned FalseReg = getRegForValue(Select->getFalseValue()); + Register FalseReg = getRegForValue(Select->getFalseValue()); if (FalseReg == 0) return false; @@ -959,7 +959,7 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { return false; } - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(TrueReg) .addReg(FalseReg) @@ -972,12 +972,12 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { bool WebAssemblyFastISel::selectTrunc(const Instruction *I) { const auto *Trunc = cast<TruncInst>(I); - unsigned Reg = getRegForValue(Trunc->getOperand(0)); + Register Reg = getRegForValue(Trunc->getOperand(0)); if (Reg == 0) return false; if (Trunc->getOperand(0)->getType()->isIntegerTy(64)) { - unsigned Result = createResultReg(&WebAssembly::I32RegClass); + Register Result = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::I32_WRAP_I64), Result) 
.addReg(Reg); @@ -994,7 +994,7 @@ bool WebAssemblyFastISel::selectZExt(const Instruction *I) { const Value *Op = ZExt->getOperand(0); MVT::SimpleValueType From = getSimpleType(Op->getType()); MVT::SimpleValueType To = getLegalType(getSimpleType(ZExt->getType())); - unsigned In = getRegForValue(Op); + Register In = getRegForValue(Op); if (In == 0) return false; unsigned Reg = zeroExtend(In, Op, From, To); @@ -1011,7 +1011,7 @@ bool WebAssemblyFastISel::selectSExt(const Instruction *I) { const Value *Op = SExt->getOperand(0); MVT::SimpleValueType From = getSimpleType(Op->getType()); MVT::SimpleValueType To = getLegalType(getSimpleType(SExt->getType())); - unsigned In = getRegForValue(Op); + Register In = getRegForValue(Op); if (In == 0) return false; unsigned Reg = signExtend(In, Op, From, To); @@ -1075,7 +1075,7 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) { if (RHS == 0) return false; - unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass); + Register ResultReg = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(LHS) .addReg(RHS); @@ -1086,11 +1086,11 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) { bool WebAssemblyFastISel::selectFCmp(const Instruction *I) { const auto *FCmp = cast<FCmpInst>(I); - unsigned LHS = getRegForValue(FCmp->getOperand(0)); + Register LHS = getRegForValue(FCmp->getOperand(0)); if (LHS == 0) return false; - unsigned RHS = getRegForValue(FCmp->getOperand(1)); + Register RHS = getRegForValue(FCmp->getOperand(1)); if (RHS == 0) return false; @@ -1136,7 +1136,7 @@ bool WebAssemblyFastISel::selectFCmp(const Instruction *I) { return false; } - unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass); + Register ResultReg = createResultReg(&WebAssembly::I32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(LHS) .addReg(RHS); @@ -1157,7 +1157,7 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) { if (!VT.isSimple() || !RetVT.isSimple()) return false; - unsigned In = getRegForValue(I->getOperand(0)); + Register In = getRegForValue(I->getOperand(0)); if (In == 0) return false; @@ -1229,7 +1229,7 @@ bool WebAssemblyFastISel::selectLoad(const Instruction *I) { materializeLoadStoreOperands(Addr); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); @@ -1284,7 +1284,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) { materializeLoadStoreOperands(Addr); - unsigned ValueReg = getRegForValue(Store->getValueOperand()); + Register ValueReg = getRegForValue(Store->getValueOperand()); if (ValueReg == 0) return false; if (VTIsi1) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 38ed4c73fb93..a221f37cfd94 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1491,8 +1491,7 @@ bool WebAssemblyTargetLowering::MatchTableForLowering(SelectionDAG &DAG, if (GA) { // We are in Case 2 above. 
Idx = Base->getOperand(1); - if (!Idx || GA->getNumValues() != 1 || Idx->getNumValues() != 1) - return false; + assert(GA->getNumValues() == 1); } else { // This might be Case 1 above (or an error) SDValue V = Base->getOperand(0); @@ -1629,7 +1628,7 @@ SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op, // local.copy between Op and its FI operand. SDValue Chain = Op.getOperand(0); SDLoc DL(Op); - unsigned Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg(); + Register Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg(); EVT VT = Src.getValueType(); SDValue Copy(DAG.getMachineNode(VT == MVT::i32 ? WebAssembly::COPY_I32 : WebAssembly::COPY_I64, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 23aaa5160abd..fe656753889f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -279,6 +279,7 @@ #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" @@ -454,12 +455,12 @@ static Function *getEmscriptenFunction(FunctionType *Ty, const Twine &Name, // Tell the linker that this function is expected to be imported from the // 'env' module. if (!F->hasFnAttribute("wasm-import-module")) { - llvm::AttrBuilder B; + llvm::AttrBuilder B(M->getContext()); B.addAttribute("wasm-import-module", "env"); F->addFnAttrs(B); } if (!F->hasFnAttribute("wasm-import-name")) { - llvm::AttrBuilder B; + llvm::AttrBuilder B(M->getContext()); B.addAttribute("wasm-import-name", F->getName()); F->addFnAttrs(B); } @@ -547,7 +548,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { for (unsigned I = 0, E = CI->arg_size(); I < E; ++I) ArgAttributes.push_back(InvokeAL.getParamAttrs(I)); - AttrBuilder FnAttrs(InvokeAL.getFnAttrs()); + AttrBuilder FnAttrs(CI->getContext(), InvokeAL.getFnAttrs()); if (FnAttrs.contains(Attribute::AllocSize)) { // The allocsize attribute (if any) refers to parameters by index and needs // to be adjusted. @@ -610,6 +611,8 @@ static bool canLongjmp(const Value *Callee) { return false; StringRef CalleeName = Callee->getName(); + // TODO Include more functions or consider checking with mangled prefixes + // The reason we include malloc/free here is to exclude the malloc/free // calls generated in setjmp prep / cleanup routines. if (CalleeName == "setjmp" || CalleeName == "malloc" || CalleeName == "free") @@ -626,11 +629,50 @@ static bool canLongjmp(const Value *Callee) { return false; // Exception-catching related functions - if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_end_catch" || + // + // We intentionally excluded __cxa_end_catch here even though it surely cannot + // longjmp, in order to maintain the unwind relationship from all existing + // catchpads (and calls within them) to catch.dispatch.longjmp. + // + // In Wasm EH + Wasm SjLj, we + // 1. Make all catchswitch and cleanuppad that unwind to caller unwind to + // catch.dispatch.longjmp instead + // 2.
Convert all longjmpable calls to invokes that unwind to + catch.dispatch.longjmp + But catchswitch BBs are removed in isel, so if an EH catchswitch (generated + from an exception)'s catchpad does not contain any calls that are converted + into invokes unwinding to catch.dispatch.longjmp, this unwind relationship + (EH catchswitch BB -> catch.dispatch.longjmp BB) is lost and + catch.dispatch.longjmp BB can be placed before the EH catchswitch BB in + CFGSort. + int ret = setjmp(buf); + try { + foo(); // longjmps + } catch (...) { + } + Then in this code, if 'foo' longjmps, it first unwinds to 'catch (...)' + catchswitch, and is not caught by that catchswitch because it is a longjmp, + then it should next unwind to catch.dispatch.longjmp BB. But if this 'catch + (...)' catchswitch -> catch.dispatch.longjmp unwind relationship is lost, + it will not unwind to catch.dispatch.longjmp, producing an incorrect + result. + // + // Every catchpad generated by Wasm C++ contains __cxa_end_catch, so we + // intentionally treat it as longjmpable to work around this problem. This is + // a hacky fix but an easy one. + // + // The comment block in findWasmUnwindDestinations() in + // SelectionDAGBuilder.cpp is addressing a similar problem. if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" || CalleeName == "__clang_call_terminate") return false; + // std::terminate, which is generated when another exception occurs while + // handling an exception, cannot longjmp. + if (CalleeName == "_ZSt9terminatev") + return false; + // Otherwise we don't know return true; } @@ -817,6 +859,32 @@ static bool containsLongjmpableCalls(const Function *F) { return false; } +// When a function contains a setjmp call but not other calls that can longjmp, +// we don't do setjmp transformation for that setjmp. But we need to convert the +// setjmp calls into "i32 0" so they don't cause link time errors. setjmp always +// returns 0 when called directly. +static void nullifySetjmp(Function *F) { + Module &M = *F->getParent(); + IRBuilder<> IRB(M.getContext()); + Function *SetjmpF = M.getFunction("setjmp"); + SmallVector<Instruction *, 1> ToErase; + + for (User *U : SetjmpF->users()) { + auto *CI = dyn_cast<CallInst>(U); + // FIXME 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj, but + // we don't support two being used together yet. + if (!CI) + report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet"); + BasicBlock *BB = CI->getParent(); + if (BB->getParent() != F) // in other function + continue; + ToErase.push_back(CI); + CI->replaceAllUsesWith(IRB.getInt32(0)); + } + for (auto *I : ToErase) + I->eraseFromParent(); +} + bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { LLVM_DEBUG(dbgs() << "********** Lower Emscripten EH & SjLj **********\n"); @@ -886,6 +954,10 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { EHTypeIDF = getEmscriptenFunction(EHTypeIDTy, "llvm_eh_typeid_for", &M); } + // Functions that contain calls to setjmp but don't have other longjmpable + // calls within them.
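The nullifySetjmp path above rests on one documented guarantee: a direct call to setjmp returns 0, and a nonzero return can only be produced by a longjmp. A minimal stand-alone illustration (not part of the patch; the function and buffer names are made up):

    #include <csetjmp>
    #include <cstdio>

    static std::jmp_buf Buf;

    // The enclosing function contains no calls that can longjmp, which is
    // exactly the case the pass nullifies: the direct setjmp call can only
    // ever return 0, so it may be replaced by the constant "i32 0".
    static int onlyDirectSetjmp() {
      if (setjmp(Buf) != 0)
        return 1; // never reached: nothing longjmps to Buf
      return 0;   // always taken
    }

    int main() { std::printf("%d\n", onlyDirectSetjmp()); } // prints 0

The SetjmpUsersToNullify set declared immediately below collects the functions for which this holds.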
+ SmallPtrSet<Function *, 4> SetjmpUsersToNullify; + if ((EnableEmSjLj || EnableWasmSjLj) && SetjmpF) { // Precompute setjmp users for (User *U : SetjmpF->users()) { @@ -896,6 +968,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { // so can ignore it if (containsLongjmpableCalls(UserF)) SetjmpUsers.insert(UserF); + else + SetjmpUsersToNullify.insert(UserF); } else { std::string S; raw_string_ostream SS(S); @@ -975,6 +1049,14 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { runSjLjOnFunction(*F); } + // Replace unnecessary setjmp calls with 0 + if ((EnableEmSjLj || EnableWasmSjLj) && !SetjmpUsersToNullify.empty()) { + Changed = true; + assert(SetjmpF); + for (Function *F : SetjmpUsersToNullify) + nullifySetjmp(F); + } + if (!Changed) { // Delete unused global variables and functions if (ResumeF) @@ -1078,20 +1160,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { } else { // This can't throw, and we don't need this invoke, just replace it with a // call+branch - SmallVector<Value *, 16> Args(II->args()); - CallInst *NewCall = - IRB.CreateCall(II->getFunctionType(), II->getCalledOperand(), Args); - NewCall->takeName(II); - NewCall->setCallingConv(II->getCallingConv()); - NewCall->setDebugLoc(II->getDebugLoc()); - NewCall->setAttributes(II->getAttributes()); - II->replaceAllUsesWith(NewCall); - ToErase.push_back(II); - - IRB.CreateBr(II->getNormalDest()); - - // Remove any PHI node entries from the exception destination - II->getUnwindDest()->removePredecessor(&BB); + changeToCall(II); } } @@ -1243,16 +1312,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // Setjmp transformation SmallVector<PHINode *, 4> SetjmpRetPHIs; Function *SetjmpF = M.getFunction("setjmp"); - for (User *U : SetjmpF->users()) { - auto *CI = dyn_cast<CallInst>(U); - // FIXME 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj, but - // we don't support two being used together yet. - if (!CI) - report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet"); - BasicBlock *BB = CI->getParent(); + for (auto *U : make_early_inc_range(SetjmpF->users())) { + auto *CB = dyn_cast<CallBase>(U); + BasicBlock *BB = CB->getParent(); if (BB->getParent() != &F) // in other function continue; + CallInst *CI = nullptr; + // setjmp cannot throw. 
So if it is an invoke, lower it to a call + if (auto *II = dyn_cast<InvokeInst>(CB)) + CI = llvm::changeToCall(II); + else + CI = cast<CallInst>(CB); + // The tail is everything right after the call, and will be reached once // when setjmp is called, and later when longjmp returns to the setjmp BasicBlock *Tail = SplitBlock(BB, CI->getNextNode()); @@ -1568,6 +1640,13 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForEmscriptenSjLj( I->eraseFromParent(); } +static BasicBlock *getCleanupRetUnwindDest(const CleanupPadInst *CPI) { + for (const User *U : CPI->users()) + if (const auto *CRI = dyn_cast<CleanupReturnInst>(U)) + return CRI->getUnwindDest(); + return nullptr; +} + // Create a catchpad in which we catch a longjmp's env and val arguments, test // if the longjmp corresponds to one of setjmps in the current function, and if // so, jump to the setjmp dispatch BB from which we go to one of post-setjmp @@ -1619,18 +1698,18 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( BasicBlock::Create(C, "setjmp.dispatch", &F, OrigEntry); cast<BranchInst>(Entry->getTerminator())->setSuccessor(0, SetjmpDispatchBB); - // Create catch.dispatch.longjmp BB a catchswitch instruction - BasicBlock *CatchSwitchBB = + // Create catch.dispatch.longjmp BB and a catchswitch instruction + BasicBlock *CatchDispatchLongjmpBB = BasicBlock::Create(C, "catch.dispatch.longjmp", &F); - IRB.SetInsertPoint(CatchSwitchBB); - CatchSwitchInst *CatchSwitch = + IRB.SetInsertPoint(CatchDispatchLongjmpBB); + CatchSwitchInst *CatchSwitchLongjmp = IRB.CreateCatchSwitch(ConstantTokenNone::get(C), nullptr, 1); // Create catch.longjmp BB and a catchpad instruction BasicBlock *CatchLongjmpBB = BasicBlock::Create(C, "catch.longjmp", &F); - CatchSwitch->addHandler(CatchLongjmpBB); + CatchSwitchLongjmp->addHandler(CatchLongjmpBB); IRB.SetInsertPoint(CatchLongjmpBB); - CatchPadInst *CatchPad = IRB.CreateCatchPad(CatchSwitch, {}); + CatchPadInst *CatchPad = IRB.CreateCatchPad(CatchSwitchLongjmp, {}); // Wasm throw and catch instructions can throw and catch multiple values, but // that requires multivalue support in the toolchain, which is currently not @@ -1696,9 +1775,9 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( // Convert all longjmpable call instructions to invokes that unwind to the // newly created catch.dispatch.longjmp BB. - SmallVector<Instruction *, 64> ToErase; + SmallVector<CallInst *, 64> LongjmpableCalls; for (auto *BB = &*F.begin(); BB; BB = BB->getNextNode()) { - for (Instruction &I : *BB) { + for (auto &I : *BB) { auto *CI = dyn_cast<CallInst>(&I); if (!CI) continue; @@ -1716,29 +1795,66 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( // setjmps in this function. We should not convert this call to an invoke. if (CI == WasmLongjmpCI) continue; - ToErase.push_back(CI); + LongjmpableCalls.push_back(CI); + } + } - // Even if the callee function has attribute 'nounwind', which is true for - // all C functions, it can longjmp, which means it can throw a Wasm - // exception now. - CI->removeFnAttr(Attribute::NoUnwind); - if (Function *CalleeF = CI->getCalledFunction()) { - CalleeF->removeFnAttr(Attribute::NoUnwind); + for (auto *CI : LongjmpableCalls) { + // Even if the callee function has attribute 'nounwind', which is true for + // all C functions, it can longjmp, which means it can throw a Wasm + // exception now. 
+ CI->removeFnAttr(Attribute::NoUnwind); + if (Function *CalleeF = CI->getCalledFunction()) + CalleeF->removeFnAttr(Attribute::NoUnwind); + + // Change it to an invoke and make it unwind to the catch.dispatch.longjmp + // BB. If the call is enclosed in another catchpad/cleanuppad scope, unwind + // to its parent pad's unwind destination instead to preserve the scope + // structure. It will eventually unwind to the catch.dispatch.longjmp. + SmallVector<OperandBundleDef, 1> Bundles; + BasicBlock *UnwindDest = nullptr; + if (auto Bundle = CI->getOperandBundle(LLVMContext::OB_funclet)) { + Instruction *FromPad = cast<Instruction>(Bundle->Inputs[0]); + while (!UnwindDest && FromPad) { + if (auto *CPI = dyn_cast<CatchPadInst>(FromPad)) { + UnwindDest = CPI->getCatchSwitch()->getUnwindDest(); + FromPad = nullptr; // stop searching + } else if (auto *CPI = dyn_cast<CleanupPadInst>(FromPad)) { + // getCleanupRetUnwindDest() can return nullptr when + // 1. This cleanuppad's matching cleanupret unwinds to caller + // 2. There is no matching cleanupret because it ends with + // unreachable. + // In case of 2, we need to traverse the parent pad chain. + UnwindDest = getCleanupRetUnwindDest(CPI); + FromPad = cast<Instruction>(CPI->getParentPad()); + } } + } + if (!UnwindDest) + UnwindDest = CatchDispatchLongjmpBB; + changeToInvokeAndSplitBasicBlock(CI, UnwindDest); + } - IRB.SetInsertPoint(CI); - BasicBlock *Tail = SplitBlock(BB, CI->getNextNode()); - // We will add a new invoke. So remove the branch created when we split - // the BB - ToErase.push_back(BB->getTerminator()); - SmallVector<Value *, 8> Args(CI->args()); - InvokeInst *II = - IRB.CreateInvoke(CI->getFunctionType(), CI->getCalledOperand(), Tail, - CatchSwitchBB, Args); - II->takeName(CI); - II->setDebugLoc(CI->getDebugLoc()); - II->setAttributes(CI->getAttributes()); - CI->replaceAllUsesWith(II); + SmallVector<Instruction *, 16> ToErase; + for (auto &BB : F) { + if (auto *CSI = dyn_cast<CatchSwitchInst>(BB.getFirstNonPHI())) { + if (CSI != CatchSwitchLongjmp && CSI->unwindsToCaller()) { + IRB.SetInsertPoint(CSI); + ToErase.push_back(CSI); + auto *NewCSI = IRB.CreateCatchSwitch(CSI->getParentPad(), + CatchDispatchLongjmpBB, 1); + NewCSI->addHandler(*CSI->handler_begin()); + NewCSI->takeName(CSI); + CSI->replaceAllUsesWith(NewCSI); + } + } + + if (auto *CRI = dyn_cast<CleanupReturnInst>(BB.getTerminator())) { + if (CRI->unwindsToCaller()) { + IRB.SetInsertPoint(CRI); + ToErase.push_back(CRI); + IRB.CreateCleanupRet(CRI->getCleanupPad(), CatchDispatchLongjmpBB); + } } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp index 3a0bef8c765c..ca6f3f194645 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp @@ -26,6 +26,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <map> + using namespace llvm; #define DEBUG_TYPE "wasm-lower-global-dtors" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp index 9d83a75a8247..6a6cac6d956f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp @@ -82,7 +82,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction( // Split multiple-VN LiveIntervals into multiple
LiveIntervals. SmallVector<LiveInterval *, 4> SplitLIs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); if (MRI.reg_nodbg_empty(Reg)) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp index 8b8593ddcbdd..5682cadc1a64 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp @@ -95,7 +95,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction( // TODO: This is fairly heavy-handed; find a better approach. // for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); // Skip unused registers. if (MRI.use_nodbg_empty(Reg)) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp index fe127dec8aed..5252db4858b9 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -98,7 +98,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Interesting register intervals:\n"); for (unsigned I = 0; I < NumVRegs; ++I) { - unsigned VReg = Register::index2VirtReg(I); + Register VReg = Register::index2VirtReg(I); if (MFI.isVRegStackified(VReg)) continue; // Skip unused registers, which can use $drop. @@ -135,7 +135,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) { LiveInterval *LI = SortedIntervals[I]; - unsigned Old = LI->reg(); + Register Old = LI->reg(); size_t Color = I; const TargetRegisterClass *RC = MRI->getRegClass(Old); @@ -152,7 +152,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { continue_outer:; } - unsigned New = SortedIntervals[Color]->reg(); + Register New = SortedIntervals[Color]->reg(); SlotMapping[I] = New; Changed |= Old != New; UsedColors.set(Color); @@ -168,7 +168,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { // Rewrite register operands. for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) { - unsigned Old = SortedIntervals[I]->reg(); + Register Old = SortedIntervals[I]->reg(); unsigned New = SlotMapping[I]; if (Old != New) MRI->replaceRegWith(Old, New); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp index c73b8a29daeb..76c78cd23130 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -89,7 +89,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { // Start the numbering for locals after the arg regs unsigned CurReg = MFI.getParams().size(); for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) { - unsigned VReg = Register::index2VirtReg(VRegIdx); + Register VReg = Register::index2VirtReg(VRegIdx); // Skip unused registers. 
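A side note on the unsigned-to-Register change running through these hunks (the use_empty check the comment above introduces continues immediately below): llvm::Register keeps its implicit conversion to unsigned, so call sites stay source-compatible, but it adds queries such as isVirtual() that catch mixups between virtual-register indices and encoded register numbers. A hedged sketch of the pattern, with an illustrative helper name:

    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include <cassert>

    // Sketch only: mirrors the loop shape used by these WebAssembly passes.
    static void forEachUsedVirtReg(llvm::MachineRegisterInfo &MRI) {
      for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
        llvm::Register Reg = llvm::Register::index2VirtReg(I);
        if (MRI.use_nodbg_empty(Reg)) // Register converts to unsigned implicitly
          continue;
        assert(Reg.isVirtual() && "index2VirtReg always yields a virtual register");
        // ... process Reg ...
      }
    }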
if (MRI.use_empty(VReg)) continue; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 42419259802e..d3ad47147ac8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -909,8 +909,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { SubsequentUse != Use.getParent()->uses().end()) { if (!SubsequentDef->isReg() || !SubsequentUse->isReg()) break; - unsigned DefReg = SubsequentDef->getReg(); - unsigned UseReg = SubsequentUse->getReg(); + Register DefReg = SubsequentDef->getReg(); + Register UseReg = SubsequentUse->getReg(); // TODO: This single-use restriction could be relaxed by using tees if (DefReg != UseReg || !MRI.hasOneUse(DefReg)) break; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index add3c799f4aa..912f61765579 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -42,8 +42,7 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, const std::string &FS, const TargetMachine &TM) : WebAssemblyGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), - TargetTriple(TT), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TSInfo(), + TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableAtomicExpand() const { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 2ba0b97229cc..e9ecff3bf514 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -216,7 +216,7 @@ private: // The operator on the top of the stack has higher precedence than the // new operator. unsigned ParenCount = 0; - while (1) { + while (true) { // Nothing to process. if (InfixOperatorStack.empty()) break; @@ -3030,7 +3030,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, ForcedDispEncoding = DispEncoding_Default; // Parse pseudo prefixes. - while (1) { + while (true) { if (Name == "{") { if (getLexer().isNot(AsmToken::Identifier)) return Error(Parser.getTok().getLoc(), "Unexpected token after '{'"); @@ -3370,7 +3370,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Operands.push_back(X86Operand::CreateToken("*", consumeToken())); // Read the operands. - while(1) { + while (true) { if (ParseOperand(Operands)) return true; if (HandleAVX512Operand(Operands)) diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp new file mode 100644 index 000000000000..78379290aae9 --- /dev/null +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -0,0 +1,64 @@ +//===------------------- X86CustomBehaviour.cpp -----------------*-C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements methods from the X86CustomBehaviour class. 
+/// +//===----------------------------------------------------------------------===// + +#include "X86CustomBehaviour.h" +#include "TargetInfo/X86TargetInfo.h" +#include "X86InstrInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/WithColor.h" + +namespace llvm { +namespace mca { + +void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) { + switch (MCI.getOpcode()) { + case X86::MFENCE: + Inst->setLoadBarrier(true); + Inst->setStoreBarrier(true); + break; + case X86::LFENCE: + Inst->setLoadBarrier(true); + break; + case X86::SFENCE: + Inst->setStoreBarrier(true); + break; + } +} + +void X86InstrPostProcess::postProcessInstruction( + std::unique_ptr<Instruction> &Inst, const MCInst &MCI) { + // Currently, we only modify certain instructions' IsALoadBarrier and + // IsAStoreBarrier flags. + setMemBarriers(Inst, MCI); +} + +} // namespace mca +} // namespace llvm + +using namespace llvm; +using namespace mca; + +static InstrPostProcess *createX86InstrPostProcess(const MCSubtargetInfo &STI, + const MCInstrInfo &MCII) { + return new X86InstrPostProcess(STI, MCII); +} + +/// Extern function to initialize the targets for the X86 backend + +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMCA() { + TargetRegistry::RegisterInstrPostProcess(getTheX86_32Target(), + createX86InstrPostProcess); + TargetRegistry::RegisterInstrPostProcess(getTheX86_64Target(), + createX86InstrPostProcess); +} diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h new file mode 100644 index 000000000000..24d26751f0a1 --- /dev/null +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -0,0 +1,47 @@ +//===-------------------- X86CustomBehaviour.h ------------------*-C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines the X86CustomBehaviour class which inherits from +/// CustomBehaviour. This class is used by the tool llvm-mca to enforce +/// target specific behaviour that is not expressed well enough in the +/// scheduling model for mca to enforce it automatically. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H +#define LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/CustomBehaviour.h" +#include "llvm/Support/TargetParser.h" + +namespace llvm { +namespace mca { + +class X86InstrPostProcess : public InstrPostProcess { + void processWaitCnt(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + + /// Called within X86InstrPostProcess to specify certain instructions + /// as load and store barriers. 
+ void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + +public: + X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) + : InstrPostProcess(STI, MCII) {} + + ~X86InstrPostProcess() {} + + void postProcessInstruction(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) override; +}; + +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index bb12ede3b729..fd82bdcd1a23 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -40,4 +40,4 @@ protected: } // end namespace llvm -#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H +#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 9da0a8129f23..8913e405539e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -111,6 +111,15 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::EFLAGS, X86::EFLAGS}, + {codeview::RegisterId::ST0, X86::ST0}, + {codeview::RegisterId::ST1, X86::ST1}, + {codeview::RegisterId::ST2, X86::ST2}, + {codeview::RegisterId::ST3, X86::ST3}, + {codeview::RegisterId::ST4, X86::ST4}, + {codeview::RegisterId::ST5, X86::ST5}, + {codeview::RegisterId::ST6, X86::ST6}, + {codeview::RegisterId::ST7, X86::ST7}, + {codeview::RegisterId::ST0, X86::FP0}, {codeview::RegisterId::ST1, X86::FP1}, {codeview::RegisterId::ST2, X86::FP2}, @@ -281,8 +290,8 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::AMD64_XMM31, X86::XMM31}, }; - for (unsigned I = 0; I < array_lengthof(RegMap); ++I) - MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); + for (const auto &I : RegMap) + MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); } MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index b22f25af26cf..94679e6e3d11 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -23,7 +23,6 @@ class MCCodeEmitter; class MCStreamer; class X86Subtarget; class TargetMachine; -struct ASanAccessInfo; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget = nullptr; diff --git a/llvm/lib/Target/X86/X86CallLowering.h b/llvm/lib/Target/X86/X86CallLowering.h index ac5b92bf4aae..0ad67cfd3532 100644 --- a/llvm/lib/Target/X86/X86CallLowering.h +++ b/llvm/lib/Target/X86/X86CallLowering.h @@ -20,8 +20,6 @@ namespace llvm { template <typename T> class ArrayRef; -class DataLayout; -class MachineRegisterInfo; class X86TargetLowering; class X86CallLowering : public CallLowering { diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 47874e82ff3b..061fff50bcea 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -56,8 +56,6 @@ public: bool isTileLoad(MachineInstr &MI); bool isTileStore(MachineInstr &MI); bool isAMXInstr(MachineInstr &MI); - void getTileStoreShape(MachineInstr &MI, - SmallVector<MachineOperand *> &ShapedTiles); MachineInstr *getKeyAMXInstr(MachineInstr *MI); void getTileShapesCfg(MachineInstr *MI, diff --git 
a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 0a7aea467809..51f2ced321bb 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -149,6 +149,17 @@ static unsigned getLEArOpcode(bool IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } +static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) { + if (Use64BitReg) { + if (isUInt<32>(Imm)) + return X86::MOV32ri64; + if (isInt<32>(Imm)) + return X86::MOV64ri32; + return X86::MOV64ri; + } + return X86::MOV32ri; +} + static bool isEAXLiveIn(MachineBasicBlock &MBB) { for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) { unsigned Reg = RegMask.PhysReg; @@ -237,11 +248,10 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, else Reg = TRI->findDeadCallerSavedReg(MBB, MBBI); - unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; unsigned AddSubRROpc = isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit); if (Reg) { - BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg) + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr) @@ -267,7 +277,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, Offset = -(Offset - SlotSize); else Offset = Offset + SlotSize; - BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax) + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax) @@ -434,7 +444,7 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, /// Emits Dwarf Info specifying offsets of callee saved registers and /// frame pointer. This is called only when basic block sections are enabled. -void X86FrameLowering::emitCalleeSavedFrameMoves( +void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); if (!hasFP(MF)) { @@ -469,7 +479,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( // Calculate offsets. for (const CalleeSavedInfo &I : CSI) { int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); if (IsPrologue) { @@ -637,6 +647,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( uint64_t AlignOffset) const { assert(Offset && "null offset"); + const bool NeedsDwarfCFI = needsDwarfCFI(MF); + const bool HasFP = hasFP(MF); const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; @@ -676,17 +688,36 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : Is64Bit ? 
X86::R11D : X86::EAX; + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); // save loop bound { - const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset); + const unsigned BoundOffset = alignDown(Offset, StackProbeSize); + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, BoundOffset); BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed) .addReg(FinalStackProbed) - .addImm(Offset / StackProbeSize * StackProbeSize) + .addImm(BoundOffset) .setMIFlag(MachineInstr::FrameSetup); + + // while in the loop, use loop-invariant reg for CFI, + // instead of the stack pointer, which changes during the loop + if (!HasFP && NeedsDwarfCFI) { + // x32 uses the same DWARF register numbers as x86-64, + // so there isn't a register number for r11d, we must use r11 instead + const Register DwarfFinalStackProbed = + STI.isTarget64BitILP32() + ? Register(getX86SubSuperRegister(FinalStackProbed, 64)) + : FinalStackProbed; + + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaRegister( + nullptr, TRI->getDwarfRegNum(DwarfFinalStackProbed, true))); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, BoundOffset)); + } } // allocate a page @@ -725,15 +756,30 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( MBB.addSuccessor(testMBB); // handle tail - unsigned TailOffset = Offset % StackProbeSize; + const unsigned TailOffset = Offset % StackProbeSize; + MachineBasicBlock::iterator TailMBBIter = tailMBB->begin(); if (TailOffset) { const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset); - BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) + BuildMI(*tailMBB, TailMBBIter, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(TailOffset) .setMIFlag(MachineInstr::FrameSetup); } + // after the loop, switch back to stack pointer for CFI + if (!HasFP && NeedsDwarfCFI) { + // x32 uses the same DWARF register numbers as x86-64, + // so there isn't a register number for esp, we must use rsp instead + const Register DwarfStackPtr = + STI.isTarget64BitILP32() + ? Register(getX86SubSuperRegister(StackPtr, 64)) + : Register(StackPtr); + + BuildCFI(*tailMBB, TailMBBIter, DL, + MCCFIInstruction::createDefCfaRegister( + nullptr, TRI->getDwarfRegNum(DwarfStackPtr, true))); + } + // Update Live In information recomputeLiveIns(*testMBB); recomputeLiveIns(*tailMBB); @@ -1705,19 +1751,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes; - if (isUInt<32>(Alloc)) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(Alloc) - .setMIFlag(MachineInstr::FrameSetup); - } else if (isInt<32>(Alloc)) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) - .addImm(Alloc) - .setMIFlag(MachineInstr::FrameSetup); - } else { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) - .addImm(Alloc) - .setMIFlag(MachineInstr::FrameSetup); - } + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Alloc)), X86::RAX) + .addImm(Alloc) + .setMIFlag(MachineInstr::FrameSetup); } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. @@ -2497,7 +2533,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // Assign slots for GPRs. It increases frame size. 
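The loop/tail split in the stack-probe hunk above can be checked with a small calculation (assumed example values, not taken from the patch; the callee-saved GPR loop that the last comment introduces resumes right below): the allocation is rounded down to a multiple of the probe size for the loop, the remainder is allocated unprobed after it, and the CFA is temporarily described through the loop-invariant register while the stack pointer moves.

    #include <cstdio>

    int main() {
      unsigned Offset = 0x2800;         // assumed bytes to allocate
      unsigned StackProbeSize = 0x1000; // one page per probe
      unsigned BoundOffset = Offset / StackProbeSize * StackProbeSize; // alignDown
      unsigned TailOffset = Offset % StackProbeSize;
      std::printf("loop bound %#x, tail %#x\n", BoundOffset, TailOffset);
      // Prints "loop bound 0x2000, tail 0x800": two probed pages in the loop,
      // then 0x800 bytes allocated after it without probing.
    }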
for (CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2514,7 +2550,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // Assign slots for XMMs. for (CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2560,7 +2596,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( const MachineFunction &MF = *MBB.getParent(); unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2594,7 +2630,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2672,7 +2708,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( // Reload XMMs from stack frame. for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2689,7 +2725,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( // POP GPRs. unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2944,15 +2980,16 @@ void X86FrameLowering::adjustForSegmentedStacks( const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; - const unsigned MOVri = IsLP64 ? 
X86::MOV64ri : X86::MOV32ri; if (IsNested) BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); - BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) - .addImm(StackSize); - BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) - .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(getMOVriOpcode(IsLP64, StackSize)), Reg10) + .addImm(StackSize); + BuildMI(allocMBB, DL, + TII.get(getMOVriOpcode(IsLP64, X86FI->getArgumentStackSize())), + Reg11) + .addImm(X86FI->getArgumentStackSize()); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index e18be0d26321..987facbfeae4 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -65,9 +65,8 @@ public: void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - void - emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const override; + void emitCalleeSavedFrameMovesFullCFA( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const override; void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 7ed05fd0331d..5b90c67deae6 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -80,9 +80,9 @@ namespace { bool NegateIndex = false; X86ISelAddressMode() - : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), - Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), - MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {} + : BaseType(RegBase), Base_FrameIndex(0), Scale(1), Disp(0), GV(nullptr), + CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1), + SymbolFlags(X86II::MO_NO_FLAG) {} bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6f6361b6757b..aff72452af6c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1096,6 +1096,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTR, VT, Custom); } + setOperationAction(ISD::FSHL, MVT::v16i8, Custom); + setOperationAction(ISD::FSHR, MVT::v16i8, Custom); + setOperationAction(ISD::FSHL, MVT::v4i32, Custom); + setOperationAction(ISD::FSHR, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); @@ -1284,6 +1289,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTR, VT, Custom); } + setOperationAction(ISD::FSHL, MVT::v32i8, Custom); + setOperationAction(ISD::FSHR, MVT::v32i8, Custom); + setOperationAction(ISD::FSHL, MVT::v8i32, Custom); + setOperationAction(ISD::FSHR, MVT::v8i32, Custom); + // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); @@ -1688,6 +1698,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SSUBSAT, VT, HasBWI ? 
Legal : Custom); } + setOperationAction(ISD::FSHL, MVT::v64i8, Custom); + setOperationAction(ISD::FSHR, MVT::v64i8, Custom); + setOperationAction(ISD::FSHL, MVT::v16i32, Custom); + setOperationAction(ISD::FSHR, MVT::v16i32, Custom); + if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); @@ -5475,10 +5490,9 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { - for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { - if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) + for (const APFloat &FPImm : LegalFPImmediates) + if (Imm.bitwiseIsEqual(FPImm)) return true; - } return false; } @@ -6132,6 +6146,29 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, return DAG.getBitcast(VT, Vec); } +// Helper to determine if the ops are all extracted subvectors that come from a +// single source. If we allow commute they don't have to be in order (Lo/Hi). +static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) { + if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR || + RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR || + LHS.getValueType() != RHS.getValueType() || + LHS.getOperand(0) != RHS.getOperand(0)) + return SDValue(); + + SDValue Src = LHS.getOperand(0); + if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2)) + return SDValue(); + + unsigned NumElts = LHS.getValueType().getVectorNumElements(); + if ((LHS.getConstantOperandAPInt(1) == 0 && + RHS.getConstantOperandAPInt(1) == NumElts) || + (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 && + LHS.getConstantOperandAPInt(1) == NumElts)) + return Src; + + return SDValue(); +} + static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { EVT VT = Vec.getValueType(); @@ -6850,8 +6887,8 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits) return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); - if (DAG.ComputeMinSignedBits(LHS) <= EltSizeInBits && - if (DAG.ComputeMinSignedBits(RHS) <= EltSizeInBits) + if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits && + DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits) return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); } @@ -7907,6 +7944,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. +// TODO: Merge into getTargetShuffleInputs() static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, @@ -8355,6 +8393,9 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { + if (Depth >= SelectionDAG::MaxRecursionDepth) + return false; // Limit search depth.
+ EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) return false; @@ -9233,8 +9274,13 @@ static bool isFoldableUseOfShuffle(SDNode *N) { return true; if (Opc == ISD::BITCAST) // Ignore bitcasts return isFoldableUseOfShuffle(U); - if (N->hasOneUse()) + if (N->hasOneUse()) { + // TODO, there may be some general way to know if a SDNode can + // be folded. We now only know whether an MI is foldable. + if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N) + return false; return true; + } } return false; } @@ -10055,13 +10101,18 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, if (IsSubAdd) return SDValue(); - // Do not generate X86ISD::ADDSUB node for 512-bit types even though - // the ADDSUB idiom has been successfully recognized. There are no known - // X86 targets with 512-bit ADDSUB instructions! - // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom - // recognition. - if (VT.is512BitVector()) - return SDValue(); + // There are no known X86 targets with 512-bit ADDSUB instructions! + // Convert to blend(fsub,fadd). + if (VT.is512BitVector()) { + SmallVector<int> Mask; + for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) { + Mask.push_back(I); + Mask.push_back(I + E + 1); + } + SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1); + return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask); + } return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } @@ -12162,12 +12213,13 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, return SDValue(); } -/// Check whether a compaction lowering can be done by dropping even -/// elements and compute how many times even elements must be dropped. +/// Check whether a compaction lowering can be done by dropping even/odd +/// elements and compute how many times even/odd elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// +/// (even) /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 @@ -12175,16 +12227,20 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// +/// (odd) +/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +/// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// -/// \returns N above, or the number of times even elements must be dropped if -/// there is such a number. Otherwise returns zero. -static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, - bool IsSingleInput) { +/// \returns N above, or the number of times even/odd elements must be dropped +/// if there is such a number. Otherwise returns zero. +static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven, + bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 
1 : 2); @@ -12192,6 +12248,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + int Offset = MatchEven ? 0 : 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with @@ -12210,7 +12267,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. - if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; @@ -15724,7 +15781,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW. // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain. - int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false); + int NumEvenDrops = canLowerByDroppingElements(Mask, true, false); if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() && !Subtarget.hasVLX()) { // Check if this is part of a 256-bit vector truncation. @@ -15758,6 +15815,20 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; } + // When compacting odd (upper) elements, use PACKSS pre-SSE41. + int NumOddDrops = canLowerByDroppingElements(Mask, false, false); + if (NumOddDrops == 1) { + bool HasSSE41 = Subtarget.hasSSE41(); + V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, V1), + DAG.getTargetConstant(16, DL, MVT::i8)); + V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, V2), + DAG.getTargetConstant(16, DL, MVT::i8)); + return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL, + MVT::v8i16, V1, V2); + } + // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) @@ -16024,7 +16095,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Check for compaction patterns. bool IsSingleInput = V2.isUndef(); - int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput); + int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput); // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -16135,6 +16206,19 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; } + int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput); + if (NumOddDrops == 1) { + V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16, + DAG.getBitcast(MVT::v8i16, V1), + DAG.getTargetConstant(8, DL, MVT::i8)); + if (!IsSingleInput) + V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16, + DAG.getBitcast(MVT::v8i16, V2), + DAG.getTargetConstant(8, DL, MVT::i8)); + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, + IsSingleInput ? V1 : V2); + } + // Handle multi-input cases by blending/unpacking single-input shuffles. 
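The odd-drop paths above can be sanity-checked with a scalar model (a sketch, not from the patch; the multi-input fallback continues right below). A logical right shift by 16 moves each odd 16-bit element into the low half of its 32-bit lane, and PACKUSDW then narrows 32-bit lanes to 16 bits with unsigned saturation, which is lossless here because every shifted lane already fits in 16 bits:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    using V4i32 = std::array<uint32_t, 4>;
    using V8i16 = std::array<uint16_t, 8>;

    // Scalar model of PACKUSDW: narrow 32-bit lanes with unsigned saturation.
    static V8i16 packusdw(const V4i32 &A, const V4i32 &B) {
      V8i16 R{};
      for (int I = 0; I < 4; ++I) {
        R[I] = A[I] > 0xFFFF ? 0xFFFF : (uint16_t)A[I];
        R[I + 4] = B[I] > 0xFFFF ? 0xFFFF : (uint16_t)B[I];
      }
      return R;
    }

    int main() {
      // v8i16 inputs viewed as v4i32: lane k holds 16-bit elements {2k, 2k+1}.
      V4i32 V1 = {0x00110000, 0x00330022, 0x00550044, 0x00770066};
      V4i32 V2 = {0x00990088, 0x00BB00AA, 0x00DD00CC, 0x00FF00EE};
      for (auto &L : V1) L >>= 16; // PSRLD $16: odd element -> low half
      for (auto &L : V2) L >>= 16;
      for (uint16_t E : packusdw(V1, V2)) std::printf("%#x ", E);
      std::printf("\n"); // 0x11 0x33 0x55 0x77 0x99 0xbb 0xdd 0xff
    }

The printed elements are exactly the N = 1 odd mask <1, 3, 5, 7, 9, 11, 13, 15> documented for canLowerByDroppingElements.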
if (NumV2Elements > 0) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask, @@ -16538,20 +16622,19 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. + bool AllLanes; if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize)) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; - if (!LaneCrossing[0] || !LaneCrossing[1]) - return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); + AllLanes = LaneCrossing[0] && LaneCrossing[1]; } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] % Size) / LaneSize] = true; - if (!LaneUsed[0] || !LaneUsed[1]) - return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); + AllLanes = LaneUsed[0] && LaneUsed[1]; } // TODO - we could support shuffling V2 in the Flipped input. @@ -16569,6 +16652,11 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && "In-lane shuffle mask expected"); + // If we're not using both lanes in each lane and the inlane mask is not + // repeating, then we're better off splitting. + if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask)) + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); + // Flip the lanes, and shuffle the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); @@ -22598,7 +22686,7 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { /// ISD::FROUND is defined to round to nearest with ties rounding away from 0. /// This mode isn't supported in hardware on X86. But as long as we aren't /// compiling with trapping math, we can emulate this with -/// floor(X + copysign(nextafter(0.5, 0.0), X)). +/// trunc(X + copysign(nextafter(0.5, 0.0), X)). static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); @@ -23157,10 +23245,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // For equality comparisons try to use SIGN_EXTEND if the input was // truncate from something with enough sign bits. if (Op0.getOpcode() == ISD::TRUNCATE) { - if (DAG.ComputeMinSignedBits(Op0.getOperand(0)) <= 16) + if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16) ExtendOp = ISD::SIGN_EXTEND; } else if (Op1.getOpcode() == ISD::TRUNCATE) { - if (DAG.ComputeMinSignedBits(Op1.getOperand(0)) <= 16) + if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16) ExtendOp = ISD::SIGN_EXTEND; } } @@ -24543,32 +24631,27 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; - - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); - // Apply further optimizations for special cases - // (select (x != 0), -1, 0) -> neg & sbb - // (select (x == 0), 0, -1) -> neg & sbb - if (isNullConstant(Y) && - (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + // 'X - 1' sets the carry flag if X == 0. + // '0 - X' sets the carry flag if X != 0. 
+ // Convert the carry flag to a -1/0 mask with sbb: + // select (X != 0), -1, Y --> 0 - X; or (sbb), Y + // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y + // select (X != 0), Y, -1 --> X - 1; or (sbb), Y + // select (X == 0), -1, Y --> X - 1; or (sbb), Y + SDValue Sub; + if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0); - Zero = DAG.getConstant(0, DL, Op.getValueType()); - return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1)); + Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0); + } else { + SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType()); + Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One); } - - Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs, - CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); - - SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); - SDValue Res = // Res = 0 or -1. - DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1)); - - if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) - Res = DAG.getNOT(DL, Res, Res.getValueType()); - - return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); + SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + Sub.getValue(1)); + return DAG.getNode(ISD::OR, DL, VT, SBB, Y); } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { @@ -25725,9 +25808,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { + SDValue PreservedSrc, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); unsigned OpcodeSelect = ISD::VSELECT; @@ -29743,20 +29826,106 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, bool IsFSHR = Op.getOpcode() == ISD::FSHR; if (VT.isVector()) { - assert(Subtarget.hasVBMI2() && "Expected VBMI2"); + APInt APIntShiftAmt; + bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); - if (IsFSHR) - std::swap(Op0, Op1); + if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { + if (IsFSHR) + std::swap(Op0, Op1); - APInt APIntShiftAmt; - if (X86::isConstantSplat(Amt, APIntShiftAmt)) { - uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); - SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); - return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, - {Op0, Op1, Imm}, DAG, Subtarget); + if (IsCstSplat) { + uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); + SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); + return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, + {Op0, Op1, Imm}, DAG, Subtarget); + } + return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, + {Op0, Op1, Amt}, DAG, Subtarget); } - return getAVX512Node(IsFSHR ? 
X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, - {Op0, Op1, Amt}, DAG, Subtarget); + assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || + VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && + "Unexpected funnel shift type!"); + + // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw. + // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))). + if (IsCstSplat) + return SDValue(); + + SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); + SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); + bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode()); + + unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL; + unsigned NumElts = VT.getVectorNumElements(); + MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); + MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); + + // Split 256-bit integers on XOP/pre-AVX2 targets. + // Split 512-bit integers on non 512-bit BWI targets. + if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 32) || + !Subtarget.hasAVX2())) || + (VT.is512BitVector() && !Subtarget.useBWIRegs() && + EltSizeInBits < 32)) { + // Pre-mask the amount modulo using the wider vector. + Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod); + return splitVectorOp(Op, DAG); + } + + // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z)) + if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) { + if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) { + SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); + SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); + ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32); + Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, Subtarget, + DAG); + Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, Subtarget, + DAG); + return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); + } + } + + MVT WideSVT = MVT::getIntegerVT( + std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32)); + MVT WideVT = MVT::getVectorVT(WideSVT, NumElts); + + // If per-element shifts are legal, fallback to generic expansion. + if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP()) + return SDValue(); + + // Attempt to fold as: + // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. + // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). 
+ if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && + supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { + Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0); + Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1); + AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); + Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0, + EltSizeInBits, DAG); + SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1); + Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod); + if (!IsFSHR) + Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res, + EltSizeInBits, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Res); + } + + // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z) + if ((IsCst && !IsFSHR && EltSizeInBits == 8) || + supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { + SDValue Z = DAG.getConstant(0, DL, VT); + SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); + SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); + SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); + SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); + SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); + return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); + } + + // Fallback to generic expansion. + return SDValue(); } assert( (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -29901,8 +30070,9 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Attempt to fold as unpack(x,x) << zext(splat(y)): // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). - // TODO: Handle vXi16 cases. - if (EltSizeInBits == 8 || EltSizeInBits == 32) { + // TODO: Handle vXi16 cases on all targets. + if (EltSizeInBits == 8 || EltSizeInBits == 32 || + (IsROTL && EltSizeInBits == 16 && !Subtarget.hasAVX())) { if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) { unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI; SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); @@ -33013,7 +33183,7 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { // AVX512BW has shifts such as vpsllvw. if (Subtarget.hasBWI() && Bits == 16) - return false; + return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. @@ -33029,6 +33199,11 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const { case X86ISD::FMAX: case X86ISD::FMIN: case X86ISD::FANDN: + case X86ISD::VPSHA: + case X86ISD::VPSHL: + case X86ISD::VSHLV: + case X86ISD::VSRLV: + case X86ISD::VSRAV: return true; } @@ -33285,9 +33460,7 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB) { // Scan forward through BB for a use/def of EFLAGS. - for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end(); - miI != miE; ++miI) { - const MachineInstr& mi = *miI; + for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) { if (mi.readsRegister(X86::EFLAGS)) return true; // If we found a def, we can stop searching. 
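// [Editor's note] A minimal scalar sketch of the algebraic identity behind
// the widening funnel-shift fold above. The helper below is hypothetical (it
// is not part of this patch or of LLVM) and assumes only standard C++ shift
// semantics; it is a sketch, not the lowering itself.
#include <cassert>
#include <cstdint>

// fshl(x, y, z) on i8 equals the low 8 bits of ((x:y) << (z & 7)) >> 8, where
// x:y is the 16-bit concatenation -- the same (aext(x) << bw) | zext(y) value
// the lowering materializes before shifting with VSHLI/VSRLI and truncating.
static uint8_t fshl8ViaWidening(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned Amt = Z & 7;                      // z & (bw - 1)
  uint16_t Wide = (uint16_t(X) << 8) | Y;    // (aext(x) << bw) | zext(y)
  return uint8_t((Wide << Amt) >> 8);        // shift, then drop the low bw
}

int main() {
  for (unsigned Z = 0; Z < 16; ++Z) {        // amounts wrap modulo bw
    uint8_t X = 0xAB, Y = 0xCD;
    unsigned Amt = Z & 7;
    uint8_t Ref = Amt ? uint8_t((X << Amt) | (Y >> (8 - Amt))) : X;
    assert(fshl8ViaWidening(X, Y, Z) == Ref);
  }
  return 0;
}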
@@ -38724,6 +38897,8 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, case X86ISD::VBROADCAST: case X86ISD::MOVDDUP: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: case X86ISD::VPERMI: case X86ISD::VPERMILPI: { if (N.getOperand(0).getValueType() == ShuffleVT && @@ -38877,9 +39052,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) return R; - if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL)) - return R; - // Handle specific target shuffles. switch (Opcode) { case X86ISD::MOVDDUP: { @@ -39844,6 +40016,12 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); + + // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). + // Perform this after other shuffle combines to allow inner shuffles to be + // combined away first. + if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N))) + return BinOp; } return SDValue(); @@ -40037,6 +40215,24 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1))); break; } + case X86ISD::VPSHA: + case X86ISD::VPSHL: + case X86ISD::VSHLV: + case X86ISD::VSRLV: + case X86ISD::VSRAV: { + APInt LHSUndef, LHSZero; + APInt RHSUndef, RHSZero; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, + Depth + 1)) + return true; + if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, + Depth + 1)) + return true; + KnownZero = LHSZero; + break; + } case X86ISD::KSHIFTL: { SDValue Src = Op.getOperand(0); auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); @@ -41799,6 +41995,37 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// (mul (zext a), (sext, b)) +static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, + SDValue &Op1) { + Op0 = Mul.getOperand(0); + Op1 = Mul.getOperand(1); + + // The operand1 should be signed extend + if (Op0.getOpcode() == ISD::SIGN_EXTEND) + std::swap(Op0, Op1); + + auto IsFreeTruncation = [](SDValue &Op) -> bool { + if ((Op.getOpcode() == ISD::ZERO_EXTEND || + Op.getOpcode() == ISD::SIGN_EXTEND) && + Op.getOperand(0).getScalarValueSizeInBits() <= 8) + return true; + + auto *BV = dyn_cast<BuildVectorSDNode>(Op); + return (BV && BV->isConstant()); + }; + + // (dpbusd (zext a), (sext, b)). Since the first operand should be unsigned + // value, we need to check Op0 is zero extended value. Op1 should be signed + // value, so we just check the signed bits. + if ((IsFreeTruncation(Op0) && + DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) && + (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8)) + return true; + + return false; +} + // Given a ABS node, detect the following pattern: // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). // This is useful as it is the input into a SAD pattern. @@ -41820,6 +42047,50 @@ static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { return true; } +static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, + unsigned &LogBias, const SDLoc &DL, + const X86Subtarget &Subtarget) { + // Extend or truncate to MVT::i8 first. 
+ MVT Vi8VT = + MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount()); + LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT); + RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT); + + // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element + // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3]. + // The src A, B element type is i8, but the dst C element type is i32. + // When we calculate the reduce stage, we use src vector type vXi8 for it + // so we need logbias 2 to avoid extra 2 stages. + LogBias = 2; + + unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits()); + if (Subtarget.hasVNNI() && !Subtarget.hasVLX()) + RegSize = std::max(512u, RegSize); + + // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we + // fill in the missing vector elements with 0. + unsigned NumConcat = RegSize / Vi8VT.getSizeInBits(); + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT)); + Ops[0] = LHS; + MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); + SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + Ops[0] = RHS; + SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + + // Actually build the DotProduct, split as 256/512 bits for + // AVXVNNI/AVX512VNNI. + auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); + return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops); + }; + MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Zero = DAG.getConstant(0, DL, DpVT); + + return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, + DpBuilder, false); +} + // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs // to these zexts. static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, @@ -41967,18 +42238,19 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { - // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have - // PCMPEQQ (SSE41+), use PCMPEQD instead. - if (BinOp == ISD::AND && !Subtarget.hasSSE41() && - Match.getOpcode() == ISD::SETCC && - ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) && + // For all_of(setcc(x,y,eq)) + // - avoid vXi64 comparisons without PCMPEQQ (SSE41+), use PCMPEQD. + // - avoid vXi16 comparisons, use PMOVMSKB(PCMPEQB()). 
+ if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC && cast<CondCodeSDNode>(Match.getOperand(2))->get() == ISD::CondCode::SETEQ) { SDValue Vec = Match.getOperand(0); - if (Vec.getValueType().getScalarType() == MVT::i64 && - (2 * NumElts) <= MaxElts) { + EVT VecSVT = Vec.getValueType().getScalarType(); + if ((VecSVT == MVT::i16 && !Subtarget.hasBWI()) || + (VecSVT == MVT::i64 && !Subtarget.hasSSE41())) { NumElts *= 2; - EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + VecSVT = VecSVT.getHalfSizedIntegerVT(*DAG.getContext()); + EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumElts); MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); Match = DAG.getSetCC( DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)), @@ -42069,6 +42341,77 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext); } +static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI()) + return SDValue(); + + EVT ExtractVT = Extract->getValueType(0); + // Verify the type we're extracting is i32, as the output element type of + // vpdpbusd is i32. + if (ExtractVT != MVT::i32) + return SDValue(); + + EVT VT = Extract->getOperand(0).getValueType(); + if (!isPowerOf2_32(VT.getVectorNumElements())) + return SDValue(); + + // Match shuffle + add pyramid. + ISD::NodeType BinOp; + SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); + + // We can't combine to vpdpbusd for zext, because each of the 4 multiplies + // done by vpdpbusd compute a signed 16-bit product that will be sign extended + // before adding into the accumulator. + // TODO: + // We also need to verify that the multiply has at least 2x the number of bits + // of the input. We shouldn't match + // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))). + // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND)) + // Root = Root.getOperand(0); + + // If there was a match, we want Root to be a mul. + if (!Root || Root.getOpcode() != ISD::MUL) + return SDValue(); + + // Check whether we have an extend and mul pattern + SDValue LHS, RHS; + if (!detectExtMul(DAG, Root, LHS, RHS)) + return SDValue(); + + // Create the dot product instruction. + SDLoc DL(Extract); + unsigned StageBias; + SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget); + + // If the original vector was wider than 4 elements, sum over the results + // in the DP vector. + unsigned Stages = Log2_32(VT.getVectorNumElements()); + EVT DpVT = DP.getValueType(); + + if (Stages > StageBias) { + unsigned DpElems = DpVT.getVectorNumElements(); + + for (unsigned i = Stages - StageBias; i > 0; --i) { + SmallVector<int, 16> Mask(DpElems, -1); + for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) + Mask[j] = MaskEnd + j; + + SDValue Shuffle = + DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask); + DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle); + } + } + + // Return the lowest ExtractSizeInBits bits. + EVT ResVT = + EVT::getVectorVT(*DAG.getContext(), ExtractVT, + DpVT.getSizeInBits() / ExtractVT.getSizeInBits()); + DP = DAG.getBitcast(ResVT, DP); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP, + Extract->getOperand(1)); +} + static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. 
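// [Editor's note] A scalar model of the VPDPBUSD semantics targeted by the
// new createVPDPBUSD/combineVPDPBUSDPattern code, to make the LogBias
// bookkeeping concrete. The helper below is illustrative only; it does not
// exist in LLVM or in any intrinsics header.
#include <cassert>
#include <cstdint>

// One i32 lane of VPDPBUSD: accumulate four unsigned-i8 x signed-i8 products,
// matching the patch's own description C[0] += A[0]B[0] + ... + A[3]B[3].
// Because one instruction already collapses four byte products per lane,
// log2(4) == 2 shuffle+add reduction stages become unnecessary -- this is the
// "LogBias = 2" subtracted from Stages above.
static int32_t vpdpbusdLane(int32_t Acc, const uint8_t A[4],
                            const int8_t B[4]) {
  for (int I = 0; I != 4; ++I)
    Acc += int32_t(A[I]) * int32_t(B[I]); // u8 zero-extends, s8 sign-extends
  return Acc;
}

int main() {
  const uint8_t A[4] = {1, 2, 3, 255};
  const int8_t B[4] = {4, -5, 6, -1};
  assert(vpdpbusdLane(0, A, B) == 1 * 4 + 2 * -5 + 3 * 6 + 255 * -1);
  return 0;
}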
@@ -42676,6 +43019,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; + if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget)) + return VPDPBUSD; + // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget)) return Cmp; @@ -42903,6 +43249,15 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); + + // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB. + if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) && + Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + if (CC == ISD::SETEQ || CC == ISD::SETNE) + return SDValue(); + } + bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) @@ -44052,6 +44407,23 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, // TESTZ(X,-1) == TESTZ(X,X) if (ISD::isBuildVectorAllOnes(Op1.getNode())) return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); + + // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y) + // TODO: Add COND_NE handling? + if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) { + SDValue Src0 = peekThroughBitcasts(Op0); + SDValue Src1 = peekThroughBitcasts(Op1); + if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) { + Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)), + peekThroughBitcasts(Src0.getOperand(1)), true); + Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)), + peekThroughBitcasts(Src1.getOperand(1)), true); + if (Src0 && Src1) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(MVT::v4i64, Src0), + DAG.getBitcast(MVT::v4i64, Src1)); + } + } } return SDValue(); @@ -44117,21 +44489,58 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, BCNumEltBits > NumEltBits && DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) { SDLoc DL(EFLAGS); - unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1); + APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC), DAG.getConstant(CmpMask, DL, MVT::i32)); } } + // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)). + // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)). + // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)). + // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)). + if (VecVT.is256BitVector()) { + SmallVector<SDValue> Ops; + if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) && + Ops.size() == 2) { + SDLoc DL(EFLAGS); + EVT SubVT = Ops[0].getValueType(); + APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2); + SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, Ops); + V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, + DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V), + DAG.getConstant(CmpMask, DL, MVT::i32)); + } + } + // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X). // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X). + // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)). + // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)). if (IsAllOf && Subtarget.hasSSE41()) { + MVT TestVT = VecVT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; SDValue BC = peekThroughBitcasts(Vec); - if (BC.getOpcode() == X86ISD::PCMPEQ && - ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) { - MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; - SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0)); + if (BC.getOpcode() == X86ISD::PCMPEQ) { + SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(), + BC.getOperand(0), BC.getOperand(1)); + V = DAG.getBitcast(TestVT, V); + return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + } + // Check for 256-bit split vector cases. + if (BC.getOpcode() == ISD::AND && + BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ && + BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) { + SDValue LHS = BC.getOperand(0); + SDValue RHS = BC.getOperand(1); + LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(), + LHS.getOperand(0), LHS.getOperand(1)); + RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(), + RHS.getOperand(0), RHS.getOperand(1)); + LHS = DAG.getBitcast(TestVT, LHS); + RHS = DAG.getBitcast(TestVT, RHS); + SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS); return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); } } @@ -44162,23 +44571,28 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // PMOVMSKB(PACKSSBW(LO(X), HI(X))) // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA. if (CmpBits >= 16 && Subtarget.hasInt256() && - VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR && - VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR && - VecOp0.getOperand(0) == VecOp1.getOperand(0) && - VecOp0.getConstantOperandAPInt(1) == 0 && - VecOp1.getConstantOperandAPInt(1) == 8 && (IsAnyOf || (SignExt0 && SignExt1))) { - SDLoc DL(EFLAGS); - SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0)); - Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); - unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF; - if (!SignExt0 || !SignExt1) { - assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns"); - Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, - DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); + if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) { + SDLoc DL(EFLAGS); + SDValue Result = peekThroughBitcasts(Src); + if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ) { + SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(), + Result.getOperand(0), Result.getOperand(1)); + V = DAG.getBitcast(MVT::v4i64, V); + return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + } + Result = DAG.getBitcast(MVT::v32i8, Result); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF; + if (!SignExt0 || !SignExt1) { + assert(IsAnyOf && + "Only perform v16i16 signmasks for any_of patterns"); + Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, + DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); + } + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + DAG.getConstant(CmpMask, DL, MVT::i32)); } - return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, - DAG.getConstant(CmpMask, DL, MVT::i32)); } } @@ -44732,7 +45146,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, return SDValue(); // Sign bits must extend down to the lowest i16. 
- if (DAG.ComputeMinSignedBits(N1) > 16 || DAG.ComputeMinSignedBits(N0) > 16) + if (DAG.ComputeMaxSignificantBits(N1) > 16 || + DAG.ComputeMaxSignificantBits(N0) > 16) return SDValue(); // At least one of the elements must be zero in the upper 17 bits, or can be @@ -45224,33 +45639,28 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, // truncation trees that help us avoid lane crossing shuffles. // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. // TODO: We don't handle vXf64 shuffles yet. - if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 && - BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR && - BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR && - BC0.getOperand(0) == BC1.getOperand(0) && - BC0.getOperand(0).getValueType().is256BitVector() && - BC0.getConstantOperandAPInt(1) == 0 && - BC1.getConstantOperandAPInt(1) == - BC0.getValueType().getVectorNumElements()) { - SmallVector<SDValue> ShuffleOps; - SmallVector<int> ShuffleMask, ScaledMask; - SDValue Vec = peekThroughBitcasts(BC0.getOperand(0)); - if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) { - resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask); - // To keep the HOP LHS/RHS coherency, we must be able to scale the unary - // shuffle to a v4X64 width - we can probably relax this in the future. - if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 && - ShuffleOps[0].getValueType().is256BitVector() && - scaleShuffleElements(ShuffleMask, 4, ScaledMask)) { - SDValue Lo, Hi; - MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; - std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL); - Lo = DAG.getBitcast(SrcVT, Lo); - Hi = DAG.getBitcast(SrcVT, Hi); - SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); - Res = DAG.getBitcast(ShufVT, Res); - Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask); - return DAG.getBitcast(VT, Res); + if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { + if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) { + SmallVector<SDValue> ShuffleOps; + SmallVector<int> ShuffleMask, ScaledMask; + SDValue Vec = peekThroughBitcasts(BCSrc); + if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) { + resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask); + // To keep the HOP LHS/RHS coherency, we must be able to scale the unary + // shuffle to a v4X64 width - we can probably relax this in the future. + if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 && + ShuffleOps[0].getValueType().is256BitVector() && + scaleShuffleElements(ShuffleMask, 4, ScaledMask)) { + SDValue Lo, Hi; + MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; + std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL); + Lo = DAG.getBitcast(SrcVT, Lo); + Hi = DAG.getBitcast(SrcVT, Hi); + SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); + Res = DAG.getBitcast(ShufVT, Res); + Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask); + return DAG.getBitcast(VT, Res); + } } } } @@ -46047,6 +46457,49 @@ static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); } +// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z). +// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws +// handles in InstCombine. 
+static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) { + unsigned Opc = N->getOpcode(); + assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) && + "Unexpected bit opcode"); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // Both operands must be single use. + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + // Search for matching shifts. + SDValue BC0 = peekThroughOneUseBitcasts(N0); + SDValue BC1 = peekThroughOneUseBitcasts(N1); + + unsigned BCOpc = BC0.getOpcode(); + EVT BCVT = BC0.getValueType(); + if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType()) + return SDValue(); + + switch (BCOpc) { + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: { + if (BC0.getOperand(1) != BC1.getOperand(1)) + return SDValue(); + + SDLoc DL(N); + SDValue BitOp = + DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0)); + SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1)); + return DAG.getBitcast(VT, Shift); + } + } + + return SDValue(); +} + /// If this is a zero/all-bits result that is bitwise-anded with a low bits /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' /// with a shift-right to eliminate loading the vector constant mask value. @@ -46350,6 +46803,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) return R; + if (SDValue R = combineBitOpWithShift(N, DAG)) + return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; @@ -46797,6 +47253,9 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) return R; + if (SDValue R = combineBitOpWithShift(N, DAG)) + return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; @@ -47837,7 +48296,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getValue().getOperand(0).getValueType() == MVT::v16i16 && TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { - SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, + St->getValue().getOperand(0)); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } @@ -48630,7 +49090,7 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, // originally concatenated from subvectors. SmallVector<SDValue> ConcatOps; if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps)) - return SDValue(); + return SDValue(); } unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16); @@ -48714,7 +49174,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // sequence or using AVX512 truncations. If the inputs are sext/zext then the // truncations may actually be free by peeking through to the ext source. 
auto IsSext = [&DAG](SDValue V) { - return DAG.ComputeMinSignedBits(V) <= 16; + return DAG.ComputeMaxSignificantBits(V) <= 16; }; auto IsZext = [&DAG](SDValue V) { return DAG.computeKnownBits(V).countMaxActiveBits() <= 16; @@ -49268,6 +49728,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) return R; + if (SDValue R = combineBitOpWithShift(N, DAG)) + return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; @@ -52185,6 +52648,22 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { + case X86ISD::VBROADCAST: { + if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) { + return Op.getOperand(0).getValueType().is128BitVector(); + })) + return DAG.getNode(X86ISD::MOVDDUP, DL, VT, + ConcatSubOperand(VT, Ops, 0)); + break; + } + case X86ISD::MOVDDUP: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: { + if (!IsSplat) + return DAG.getNode(Op0.getOpcode(), DL, VT, + ConcatSubOperand(VT, Ops, 0)); + break; + } case X86ISD::SHUFP: { // Add SHUFPD support if/when necessary. if (!IsSplat && VT.getScalarType() == MVT::f32 && @@ -52207,14 +52686,21 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } LLVM_FALLTHROUGH; case X86ISD::VPERMILPI: - // TODO - add support for vXf64/vXi64 shuffles. if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) && - Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) { + Op0.getOperand(1) == Ops[1].getOperand(1)) { SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0)); Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res, Op0.getOperand(1)); return DAG.getBitcast(VT, Res); } + if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) { + uint64_t Idx0 = Ops[0].getConstantOperandVal(1); + uint64_t Idx1 = Ops[1].getConstantOperandVal(1); + uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3); + return DAG.getNode(Op0.getOpcode(), DL, VT, + ConcatSubOperand(VT, Ops, 0), + DAG.getTargetConstant(Idx, DL, MVT::i8)); + } break; case X86ISD::VPERMV3: if (!IsSplat && NumOps == 2 && VT.is512BitVector()) { @@ -52268,6 +52754,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } LLVM_FALLTHROUGH; case X86ISD::VSRAI: + case X86ISD::VSHL: + case X86ISD::VSRL: + case X86ISD::VSRA: if (((VT.is256BitVector() && Subtarget.hasInt256()) || (VT.is512BitVector() && Subtarget.useAVX512Regs() && (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) && diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index d1d6e319f16b..3f6d567d3f4d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1540,7 +1540,7 @@ namespace llvm { unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG) const; - unsigned getAddressSpace(void) const; + unsigned getAddressSpace() const; SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, SDValue &Chain) const; diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 6642f46e64b2..7e751a4c8811 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -95,14 +95,45 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) { return Attrs.hasFnAttr(Attribute::ReturnsTwice); } +// Checks if function should have an ENDBR in its prologue +static bool needsPrologueENDBR(MachineFunction &MF, 
const Module *M) { + Function &F = MF.getFunction(); + + if (F.doesNoCfCheck()) + return false; + + const X86TargetMachine *TM = + static_cast<const X86TargetMachine *>(&MF.getTarget()); + Metadata *IBTSeal = M->getModuleFlag("ibt-seal"); + + switch (TM->getCodeModel()) { + // Large code model functions always reachable through indirect calls. + case CodeModel::Large: + return true; + // Only address taken functions in LTO'ed kernel are reachable indirectly. + // IBTSeal implies LTO, thus only check if function is address taken. + case CodeModel::Kernel: + // Check if ibt-seal was enabled (implies LTO is being used). + if (IBTSeal) { + return F.hasAddressTaken(); + } + // if !IBTSeal, fall into default case. + LLVM_FALLTHROUGH; + // Address taken or externally linked functions may be reachable. + default: + return (F.hasAddressTaken() || !F.hasLocalLinkage()); + } +} + bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>(); + const Module *M = MF.getMMI().getModule(); // Check that the cf-protection-branch is enabled. - Metadata *isCFProtectionSupported = - MF.getMMI().getModule()->getModuleFlag("cf-protection-branch"); - // NB: We need to enable IBT in jitted code if JIT compiler is CET - // enabled. + Metadata *isCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); + + // NB: We need to enable IBT in jitted code if JIT compiler is CET + // enabled. const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF.getTarget()); #ifdef __CET__ @@ -119,13 +150,8 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { TII = SubTarget.getInstrInfo(); EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32; - // Large code model, non-internal function or function whose address - // was taken, can be accessed through indirect calls. Mark the first - // BB with ENDBR instruction unless nocf_check attribute is used. - if ((TM->getCodeModel() == CodeModel::Large || - MF.getFunction().hasAddressTaken() || - !MF.getFunction().hasLocalLinkage()) && - !MF.getFunction().doesNoCfCheck()) { + // If function is reachable indirectly, mark the first BB with ENDBR. + if (needsPrologueENDBR(MF, M)) { auto MBB = MF.begin(); Changed |= addENDBR(*MBB, MBB->begin()); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index ecd4777c3533..bc67d1f89d7f 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -10537,13 +10537,12 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { -// TODO - Replace WriteMove with WriteVecTrunc? 
let Predicates = [prd] in - defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr, WriteMove>, EVEX_V512; + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr, WriteVecMoveZ>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr, WriteMove>, EVEX_V256; - defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr, WriteMove>, EVEX_V128; + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr, WriteVecMoveY>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr, WriteVecMoveX>, EVEX_V128; } } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index c379aa8d9258..4dcd886fa3b2 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4088,8 +4088,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg, Register SrcReg2, int64_t ImmMask, int64_t ImmValue, - const MachineInstr &OI, - bool *IsSwapped) const { + const MachineInstr &OI, bool *IsSwapped, + int64_t *ImmDelta) const { switch (OI.getOpcode()) { case X86::CMP64rr: case X86::CMP32rr: @@ -4140,10 +4140,21 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, int64_t OIMask; int64_t OIValue; if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) && - SrcReg == OISrcReg && ImmMask == OIMask && OIValue == ImmValue) { - assert(SrcReg2 == X86::NoRegister && OISrcReg2 == X86::NoRegister && - "should not have 2nd register"); - return true; + SrcReg == OISrcReg && ImmMask == OIMask) { + if (OIValue == ImmValue) { + *ImmDelta = 0; + return true; + } else if (static_cast<uint64_t>(ImmValue) == + static_cast<uint64_t>(OIValue) - 1) { + *ImmDelta = -1; + return true; + } else if (static_cast<uint64_t>(ImmValue) == + static_cast<uint64_t>(OIValue) + 1) { + *ImmDelta = 1; + return true; + } else { + return false; + } } } return FlagI.isIdenticalTo(OI); @@ -4393,6 +4404,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool ShouldUpdateCC = false; bool IsSwapped = false; X86::CondCode NewCC = X86::COND_INVALID; + int64_t ImmDelta = 0; // Search backward from CmpInstr for the next instruction defining EFLAGS. const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -4439,7 +4451,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // ... // EFLAGS not changed // cmp x, y // <-- can be removed if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue, - Inst, &IsSwapped)) { + Inst, &IsSwapped, &ImmDelta)) { Sub = &Inst; break; } @@ -4473,7 +4485,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // It is safe to remove CmpInstr if EFLAGS is redefined or killed. // If we are done with the basic block, we need to check whether EFLAGS is // live-out. - bool IsSafe = false; + bool FlagsMayLiveOut = true; SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate; MachineBasicBlock::iterator AfterCmpInstr = std::next(MachineBasicBlock::iterator(CmpInstr)); @@ -4483,7 +4495,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // We should check the usage if this instruction uses and updates EFLAGS. if (!UseEFLAGS && ModifyEFLAGS) { // It is safe to remove CmpInstr if EFLAGS is updated again. 
- IsSafe = true; + FlagsMayLiveOut = false; break; } if (!UseEFLAGS && !ModifyEFLAGS) @@ -4491,7 +4503,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // EFLAGS is used by this instruction. X86::CondCode OldCC = X86::COND_INVALID; - if (MI || IsSwapped) { + if (MI || IsSwapped || ImmDelta != 0) { // We decode the condition code from opcode. if (Instr.isBranch()) OldCC = X86::getCondFromBranch(Instr); @@ -4545,9 +4557,59 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, ReplacementCC = getSwappedCondition(OldCC); if (ReplacementCC == X86::COND_INVALID) return false; + ShouldUpdateCC = true; + } else if (ImmDelta != 0) { + unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); + // Shift amount for min/max constants to adjust for 8/16/32 instruction + // sizes. + switch (OldCC) { + case X86::COND_L: // x <s (C + 1) --> x <=s C + if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) + return false; + ReplacementCC = X86::COND_LE; + break; + case X86::COND_B: // x <u (C + 1) --> x <=u C + if (ImmDelta != 1 || CmpValue == 0) + return false; + ReplacementCC = X86::COND_BE; + break; + case X86::COND_GE: // x >=s (C + 1) --> x >s C + if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) + return false; + ReplacementCC = X86::COND_G; + break; + case X86::COND_AE: // x >=u (C + 1) --> x >u C + if (ImmDelta != 1 || CmpValue == 0) + return false; + ReplacementCC = X86::COND_A; + break; + case X86::COND_G: // x >s (C - 1) --> x >=s C + if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) + return false; + ReplacementCC = X86::COND_GE; + break; + case X86::COND_A: // x >u (C - 1) --> x >=u C + if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) + return false; + ReplacementCC = X86::COND_AE; + break; + case X86::COND_LE: // x <=s (C - 1) --> x <s C + if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) + return false; + ReplacementCC = X86::COND_L; + break; + case X86::COND_BE: // x <=u (C - 1) --> x <u C + if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) + return false; + ReplacementCC = X86::COND_B; + break; + default: + return false; + } + ShouldUpdateCC = true; } - if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { + if (ShouldUpdateCC && ReplacementCC != OldCC) { // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. @@ -4555,14 +4617,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. - IsSafe = true; + FlagsMayLiveOut = false; break; } } - // If EFLAGS is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if ((MI || IsSwapped) && !IsSafe) { + // If we have to update users but EFLAGS is live-out abort, since we cannot + // easily find all of the users. 
+ if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) { for (MachineBasicBlock *Successor : CmpMBB.successors()) if (Successor->isLiveIn(X86::EFLAGS)) return false; diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 537ada6222bf..33ce55bbdb2b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -643,7 +643,8 @@ private: /// CMP %1, %2 and %3 = SUB %2, %1 ; IsSwapped=true bool isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg, Register SrcReg2, int64_t ImmMask, int64_t ImmValue, - const MachineInstr &OI, bool *IsSwapped) const; + const MachineInstr &OI, bool *IsSwapped, + int64_t *ImmDelta) const; }; } // namespace llvm diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp index 8abbaa92c8cf..28d57ca9ae3c 100644 --- a/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -153,8 +153,8 @@ private: X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI) - : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), + : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), + RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 6967a96ce83b..d0562214a025 100644 --- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -610,7 +610,7 @@ MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, auto replaceOldReg = [OldReg, NewReg](const MachineOperand &Op) { if (Op.isReg() && Op.getReg() == OldReg) return MachineOperand::CreateReg(NewReg, false, false, false, false, - false, false, false, false, 0, + false, false, false, false, false, /*IsRenamable*/ true); return Op; }; diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index 47ae517ae76d..e92b1b002bb0 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -129,10 +129,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; // Pad the identified basic blocks with NOOPs - for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); - I != ReturnBBs.end(); ++I) { - MachineBasicBlock *MBB = I->first; - unsigned Cycles = I->second; + for (const auto &ReturnBB : ReturnBBs) { + MachineBasicBlock *MBB = ReturnBB.first; + unsigned Cycles = ReturnBB.second; // Function::hasOptSize is already checked above. 
bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI); diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index babd923e7496..4342ac089cae 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -13,15 +13,16 @@ //===----------------------------------------------------------------------===// #include "X86.h" +#include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" -#include "X86TargetMachine.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; @@ -49,7 +50,7 @@ public: } private: - bool tryMAddReplacement(Instruction *Op); + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); bool trySADReplacement(Instruction *Op); }; } @@ -63,7 +64,43 @@ char X86PartialReduction::ID = 0; INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, "X86 Partial Reduction", false, false) -bool X86PartialReduction::tryMAddReplacement(Instruction *Op) { +// This function should be aligned with detectExtMul() in X86ISelLowering.cpp. +static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, + const DataLayout *DL) { + if (!ST->hasVNNI() && !ST->hasAVXVNNI()) + return false; + + Value *LHS = Mul->getOperand(0); + Value *RHS = Mul->getOperand(1); + + if (isa<SExtInst>(LHS)) + std::swap(LHS, RHS); + + auto IsFreeTruncation = [&](Value *Op) { + if (auto *Cast = dyn_cast<CastInst>(Op)) { + if (Cast->getParent() == Mul->getParent() && + (Cast->getOpcode() == Instruction::SExt || + Cast->getOpcode() == Instruction::ZExt) && + Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 8) + return true; + } + + return isa<Constant>(Op); + }; + + // (dpbusd (zext a), (sext, b)). Since the first operand should be unsigned + // value, we need to check LHS is zero extended value. RHS should be signed + // value, so we just check the signed bits. + if ((IsFreeTruncation(LHS) && + computeKnownBits(LHS, *DL).countMaxActiveBits() <= 8) && + (IsFreeTruncation(RHS) && ComputeMaxSignificantBits(RHS, *DL) <= 8)) + return true; + + return false; +} + +bool X86PartialReduction::tryMAddReplacement(Instruction *Op, + bool ReduceInOneBB) { if (!ST->hasSSE2()) return false; @@ -82,6 +119,13 @@ bool X86PartialReduction::tryMAddReplacement(Instruction *Op) { Value *LHS = Mul->getOperand(0); Value *RHS = Mul->getOperand(1); + // If the target support VNNI, leave it to ISel to combine reduce operation + // to VNNI instruction. + // TODO: we can support transforming reduce to VNNI intrinsic for across block + // in this pass. + if (ReduceInOneBB && matchVPDPBUSDPattern(ST, Mul, DL)) + return false; + // LHS and RHS should be only used once or if they are the same then only // used twice. Only check this when SSE4.1 is enabled and we have zext/sext // instructions, otherwise we use punpck to emulate zero extend in stages. The @@ -300,7 +344,9 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { // Walk backwards from the ExtractElementInst and determine if it is the end of // a horizontal reduction. Return the input to the reduction if we find one. 
-static Value *matchAddReduction(const ExtractElementInst &EE) { +static Value *matchAddReduction(const ExtractElementInst &EE, + bool &ReduceInOneBB) { + ReduceInOneBB = true; // Make sure we're extracting index 0. auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand()); if (!Index || !Index->isNullValue()) @@ -309,6 +355,8 @@ static Value *matchAddReduction(const ExtractElementInst &EE) { const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand()); if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse()) return nullptr; + if (EE.getParent() != BO->getParent()) + ReduceInOneBB = false; unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements(); // Ensure the reduction size is a power of 2. @@ -321,6 +369,8 @@ static Value *matchAddReduction(const ExtractElementInst &EE) { const auto *BO = dyn_cast<BinaryOperator>(Op); if (!BO || BO->getOpcode() != Instruction::Add) return nullptr; + if (EE.getParent() != BO->getParent()) + ReduceInOneBB = false; // If this isn't the first add, then it should only have 2 users, the // shuffle and another add which we checked in the previous iteration. @@ -460,9 +510,10 @@ bool X86PartialReduction::runOnFunction(Function &F) { if (!EE) continue; + bool ReduceInOneBB; // First find a reduction tree. // FIXME: Do we need to handle other opcodes than Add? - Value *Root = matchAddReduction(*EE); + Value *Root = matchAddReduction(*EE, ReduceInOneBB); if (!Root) continue; @@ -470,7 +521,7 @@ bool X86PartialReduction::runOnFunction(Function &F) { collectLeaves(Root, Leaves); for (Instruction *I : Leaves) { - if (tryMAddReplacement(I)) { + if (tryMAddReplacement(I, ReduceInOneBB)) { MadeChange = true; continue; } diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index a6ff472aac6f..8e317dc22bd6 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -255,6 +255,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5 defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>; defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub. @@ -418,6 +419,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>; @@ -1741,4 +1743,40 @@ def BWSETA_SETBErm : SchedWriteVariant<[ def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>; def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. 
+ PXORrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. + DepBreakingClass<[ + // xmm fp variants. + VXORPSrr, VXORPDrr, + + // xmm int variants. + VPXORrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, + + // ymm variants. + VXORPSYrr, VXORPDYrr, VPXORYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 371a9571ae39..1cd0b3379684 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -257,6 +257,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5 defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>; defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>; @@ -416,6 +417,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [HWPort0], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [HWPort5], 1, [1], 1>; @@ -2030,4 +2032,40 @@ def HWSETA_SETBErm : SchedWriteVariant<[ def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>; def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. + PXORrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. + DepBreakingClass<[ + // xmm fp variants. + VXORPSrr, VXORPDrr, + + // xmm int variants. + VPXORrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, + + // ymm variants. + VXORPSYrr, VXORPDYrr, VPXORYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 789de9eb5751..9fd986e34181 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -252,6 +252,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [ICXPort237,ICXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMove, [ICXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [ICXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [ICXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveZ, [ICXPort05], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [ICXPort05,ICXPort0156], 10, [9,1], 10>; defm : ICXWriteResPair<WriteFAdd, [ICXPort01], 4, [1], 1, 5>; // Floating point add/sub. 
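// [Editor's note] Context for the IsZeroIdiomFunction tables added to these
// scheduler models: every listed instruction produces an all-zero result
// regardless of its source register, so the hardware can treat it as
// dependency-breaking instead of waiting on the old register value. A quick
// SSE2 intrinsics check of that property (illustrative snippet assuming an
// SSE2-capable target; it is not part of this patch):
#include <cassert>
#include <emmintrin.h>

int main() {
  __m128i X = _mm_set1_epi8(0x5A);
  __m128i Zero = _mm_setzero_si128();
  // XOR and SUB of a register with itself are always zero (PXOR, PSUBB)...
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_xor_si128(X, X), Zero)) == 0xFFFF);
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_sub_epi8(X, X), Zero)) == 0xFFFF);
  // ...and signed "X > X" is false in every lane (PCMPGTB).
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_cmpgt_epi8(X, X), Zero)) == 0xFFFF);
  return 0;
}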
@@ -367,6 +368,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [ICXPort237,ICXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [ICXPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [ICXPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [ICXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveZ, [ICXPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveToGpr, [ICXPort0], 2, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [ICXPort5], 1, [1], 1>; @@ -2630,4 +2632,48 @@ def ICXSETA_SETBErm : SchedWriteVariant<[ def : InstRW<[ICXSETA_SETBErr], (instrs SETCCr)>; def : InstRW<[ICXSETA_SETBErm], (instrs SETCCm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. + PXORrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. + DepBreakingClass<[ + // xmm fp variants. + VXORPSrr, VXORPDrr, + + // xmm int variants. + VPXORrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, + + // ymm variants. + VXORPSYrr, VXORPDYrr, VPXORYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr, + + // zmm variants. + VXORPSZrr, VXORPDZrr, VPXORDZrr, VPXORQZrr, + VXORPSZ128rr, VXORPDZ128rr, VPXORDZ128rr, VPXORQZ128rr, + VXORPSZ256rr, VXORPDZ256rr, VPXORDZ256rr, VPXORQZ256rr, + VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr, + VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr, + VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr, + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index af5c0540deb5..7e619a3a8722 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -223,6 +223,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1] defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveZ, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>; defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>; @@ -380,6 +381,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1, defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveZ, [SBPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveToGpr, [SBPort0], 2, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [SBPort5], 1, [1], 1>; @@ -1230,4 +1232,35 @@ def SBSETA_SETBErm : SchedWriteVariant<[ def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>; def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. 
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. + PXORrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. + DepBreakingClass<[ + // xmm fp variants. + VXORPSrr, VXORPDrr, + + // xmm int variants. + VPXORrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index b3c13c72dd01..0a88bac5aa66 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -244,6 +244,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>; defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub. @@ -359,6 +360,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [SKLPort0], 2, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [SKLPort5], 1, [1], 1>; @@ -1901,4 +1903,40 @@ def SKLSETA_SETBErm : SchedWriteVariant<[ def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>; def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. + PXORrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. + DepBreakingClass<[ + // xmm fp variants. + VXORPSrr, VXORPDrr, + + // xmm int variants. + VPXORrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, + + // ymm variants. 
+ VXORPSYrr, VXORPDYrr, VPXORYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 74f9da158353..b28a18f0dcd7 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -244,6 +244,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveZ, [SKXPort05], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>; defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; // Floating point add/sub. @@ -359,6 +360,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveZ, [SKXPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveToGpr, [SKXPort0], 2, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [SKXPort5], 1, [1], 1>; @@ -2613,4 +2615,48 @@ def SKXSETA_SETBErm : SchedWriteVariant<[ def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>; def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. + PXORrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. + DepBreakingClass<[ + // xmm fp variants. + VXORPSrr, VXORPDrr, + + // xmm int variants. + VPXORrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr, + + // ymm variants. + VXORPSYrr, VXORPDYrr, VPXORYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr, + + // zmm variants. + VXORPSZrr, VXORPDZrr, VPXORDZrr, VPXORQZrr, + VXORPSZ128rr, VXORPDZ128rr, VPXORDZ128rr, VPXORQZ128rr, + VXORPSZ256rr, VXORPDZ256rr, VPXORDZ256rr, VPXORQZ256rr, + VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr, + VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr, + VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr, + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 1cb48175260a..d57e14715a4e 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -239,6 +239,7 @@ def WriteFMaskedStore64Y : SchedWrite; def WriteFMove : SchedWrite; def WriteFMoveX : SchedWrite; def WriteFMoveY : SchedWrite; +def WriteFMoveZ : SchedWrite; defm WriteFAdd : X86SchedWritePair<ReadAfterVecLd>; // Floating point add/sub. defm WriteFAddX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point add/sub (XMM). 
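// The new Z-suffixed scheduling classes model 512-bit (ZMM) register moves.
// Every CPU model must now bind them one way or the other: AVX-512 targets
// map them onto real ports, while older cores mark them unsupported. Both
// forms, taken verbatim from the per-CPU files in this import, look like:
//
//   defm : X86WriteRes<WriteFMoveZ, [SKXPort05], 1, [1], 1>;  // Skylake-AVX512
//   defm : X86WriteResUnsupported<WriteFMoveZ>;               // pre-AVX-512 cores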
@@ -354,6 +355,7 @@ def WriteVecMaskedStore64Y : SchedWrite; def WriteVecMove : SchedWrite; def WriteVecMoveX : SchedWrite; def WriteVecMoveY : SchedWrite; +def WriteVecMoveZ : SchedWrite; def WriteVecMoveToGpr : SchedWrite; def WriteVecMoveFromGpr : SchedWrite; @@ -516,9 +518,11 @@ def WriteFMoveLSX : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>; def WriteFMoveLSY : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>; +def WriteFMoveLSZ + : X86SchedWriteMoveLS<WriteFMoveZ, WriteFLoadY, WriteFStoreY>; def SchedWriteFMoveLS : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX, - WriteFMoveLSY, WriteFMoveLSY>; + WriteFMoveLSY, WriteFMoveLSZ>; def WriteFMoveLSNT : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNT>; @@ -536,9 +540,11 @@ def WriteVecMoveLSX : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>; def WriteVecMoveLSY : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>; +def WriteVecMoveLSZ + : X86SchedWriteMoveLS<WriteVecMoveZ, WriteVecLoadY, WriteVecStoreY>; def SchedWriteVecMoveLS : X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX, - WriteVecMoveLSY, WriteVecMoveLSY>; + WriteVecMoveLSY, WriteVecMoveLSZ>; def WriteVecMoveLSNT : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoadNT, WriteVecStoreNT>; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 0fedfc01092c..8ae8e574f87a 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -229,6 +229,7 @@ defm : X86WriteResUnsupported<WriteFMaskedStore64Y>; def : WriteRes<WriteFMove, [AtomPort01]>; def : WriteRes<WriteFMoveX, [AtomPort01]>; defm : X86WriteResUnsupported<WriteFMoveY>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>; @@ -382,6 +383,7 @@ defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>; def : WriteRes<WriteVecMove, [AtomPort0]>; def : WriteRes<WriteVecMoveX, [AtomPort01]>; defm : X86WriteResUnsupported<WriteVecMoveY>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [AtomPort0], 3, [3], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [AtomPort0], 1, [1], 1>; diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td index 0f6f24f9f1fe..cb75c3660728 100644 --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -772,6 +772,7 @@ defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2 defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>; defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>; defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>; @@ -1107,6 +1108,7 @@ defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>; defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>; defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>; defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> { } diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index a070da34cab5..4b2fa87a25b5 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -525,6 +525,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU defm : X86WriteRes<WriteFMove, 
[JFPU01, JFPX], 1, [1, 1], 1>; defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>; defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>; @@ -682,6 +683,7 @@ defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>; defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>; defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>; defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 36e5b55a4194..52605c031617 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -200,6 +200,7 @@ def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>; def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>; defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>; @@ -345,6 +346,7 @@ def : WriteRes<WriteVecMaskedStore64Y, [SLM_MEC_RSV]>; def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>; def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>; def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>; def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>; @@ -480,4 +482,22 @@ def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQrm, PADDQrm, MMX_PSUBQrm, PSUBQrm, PCMPEQQrm)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ XOR32rr ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, + + // int variants. + PXORrr, + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 4343e1ed45d1..fe0484afd227 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -286,6 +286,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>; defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>; @@ -404,6 +405,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>; defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>; @@ -1541,4 +1543,83 @@ def : InstRW<[WriteMicrocoded], (instrs VZEROUPPER)>; // VZEROALL. 
def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ + SUB32rr, SUB64rr, + XOR32rr, XOR64rr + ], ZeroIdiomPredicate>, + + // MMX Zero-idioms. + DepBreakingClass<[ + MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr, + MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr + ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr, + + // int variants. + PXORrr, PANDNrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX XMM Zero-idioms. + DepBreakingClass<[ + // fp variants. + VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr, + + // int variants. + VPXORrr, VPANDNrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr + ], ZeroIdiomPredicate>, + + // AVX YMM Zero-idioms. + DepBreakingClass<[ + // fp variants + VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr, + + // int variants + VPXORYrr, VPANDNYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr + ], ZeroIdiomPredicate> +]>; + +def : IsDepBreakingFunction<[ + // GPR + DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>, + DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >, + + // MMX + DepBreakingClass<[ + MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr + ], ZeroIdiomPredicate>, + + // SSE + DepBreakingClass<[ + PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr + ], ZeroIdiomPredicate>, + + // AVX XMM + DepBreakingClass<[ + VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr + ], ZeroIdiomPredicate>, + + // AVX YMM + DepBreakingClass<[ + VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index 96d2837880c7..38908a987595 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -274,6 +274,7 @@ defm : X86WriteRes<WriteFStoreNTY, [Zn2AGU], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [Zn2FPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [Zn2FPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [Zn2FPU], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>; defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU0], 3>; @@ -388,6 +389,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; defm : X86WriteRes<WriteVecMoveToGpr, [Zn2FPU2], 2, [1], 1>; defm : X86WriteRes<WriteVecMoveFromGpr, [Zn2FPU2], 3, [1], 1>; defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>; @@ -1530,4 +1532,83 @@ def : InstRW<[WriteALU], (instrs VZEROUPPER)>; // VZEROALL. def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. 
+/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ + SUB32rr, SUB64rr, + XOR32rr, XOR64rr + ], ZeroIdiomPredicate>, + + // MMX Zero-idioms. + DepBreakingClass<[ + MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr, + MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr + ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + // fp variants. + XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr, + + // int variants. + PXORrr, PANDNrr, + PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr, + PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX XMM Zero-idioms. + DepBreakingClass<[ + // fp variants. + VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr, + + // int variants. + VPXORrr, VPANDNrr, + VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr, + VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr + ], ZeroIdiomPredicate>, + + // AVX YMM Zero-idioms. + DepBreakingClass<[ + // fp variants + VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr, + + // int variants + VPXORYrr, VPANDNYrr, + VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr, + VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr + ], ZeroIdiomPredicate> +]>; + +def : IsDepBreakingFunction<[ + // GPR + DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>, + DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >, + + // MMX + DepBreakingClass<[ + MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr + ], ZeroIdiomPredicate>, + + // SSE + DepBreakingClass<[ + PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr + ], ZeroIdiomPredicate>, + + // AVX XMM + DepBreakingClass<[ + VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr + ], ZeroIdiomPredicate>, + + // AVX YMM + DepBreakingClass<[ + VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr + ], ZeroIdiomPredicate>, +]>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index f4e03ac11f0b..02f7f8376fdb 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -1446,10 +1446,12 @@ defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exc defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>; defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>; +defm : X86WriteResUnsupported<WriteFMoveZ>; defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>; defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>; +defm : X86WriteResUnsupported<WriteVecMoveZ>; def : IsOptimizableRegisterMove<[ InstructionEquivalenceClass<[ diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 83a4a025f518..dba11e8b4000 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -1139,7 +1139,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // branch back to itself. We can do this here because at this point, every // predecessor of this block has an available value. This is basically just // automating the construction of a PHI node for this target. 
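// The hunks below replace raw `unsigned` register ids with the `Register`
// wrapper. The two convert implicitly in both directions, so the change is
// behavior-preserving; `Register` simply documents intent and adds helpers.
// A minimal illustration (hypothetical snippet, not taken from this file):
//
//   Register R = MRI->createVirtualRegister(RC); // typed handle
//   if (R.isVirtual())                           // helper a raw unsigned lacks
//     ...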
- unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB); + Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB); // Insert a comparison of the incoming target register with this block's // address. This also requires us to mark the block as having its address @@ -1642,7 +1642,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( return; // Compute the current predicate state. - unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB); + Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB); auto InsertPt = MI.getIterator(); @@ -1913,7 +1913,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( auto *RC = MRI->getRegClass(Reg); int Bytes = TRI->getRegSizeInBits(*RC) / 8; - unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB); + Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB); assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) && "Unknown register size"); @@ -2078,7 +2078,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( // First, we transfer the predicate state into the called function by merging // it into the stack pointer. This will kill the current def of the state. - unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB); + Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB); mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg); // If this call is also a return, it is a tail call and we don't need anything diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 78bc5519c23f..e3d0128dd73d 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -127,7 +127,7 @@ static std::string computeDataLayout(const Triple &TT) { // Some ABIs align long double to 128 bits, others to 32. if (TT.isOSNaCl() || TT.isOSIAMCU()) ; // No f80 - else if (TT.isArch64Bit() || TT.isOSDarwin()) + else if (TT.isArch64Bit() || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment()) Ret += "-f80:128"; else Ret += "-f80:32"; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d8cd7311a0d5..5b95c10332dc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -43,6 +43,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" @@ -3429,6 +3430,20 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ICA.isTypeBasedOnly()) return getTypeBasedIntrinsicInstrCost(ICA, CostKind); + static const CostTblEntry AVX512BWCostTbl[] = { + { ISD::ROTL, MVT::v32i16, 2 }, + { ISD::ROTL, MVT::v16i16, 2 }, + { ISD::ROTL, MVT::v8i16, 2 }, + { ISD::ROTL, MVT::v64i8, 5 }, + { ISD::ROTL, MVT::v32i8, 5 }, + { ISD::ROTL, MVT::v16i8, 5 }, + { ISD::ROTR, MVT::v32i16, 2 }, + { ISD::ROTR, MVT::v16i16, 2 }, + { ISD::ROTR, MVT::v8i16, 2 }, + { ISD::ROTR, MVT::v64i8, 5 }, + { ISD::ROTR, MVT::v32i8, 5 }, + { ISD::ROTR, MVT::v16i8, 5 } + }; static const CostTblEntry AVX512CostTbl[] = { { ISD::ROTL, MVT::v8i64, 1 }, { ISD::ROTL, MVT::v4i64, 1 }, @@ -3506,6 +3521,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, MVT MTy = LT.second; // Attempt to lookup cost. 
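// (Worked example: with BWI, a rotl on <32 x i16> is already a legal vector
//  type, so the legalization factor LT.first is 1 and the AVX512BWCostTbl
//  entry { ISD::ROTL, MVT::v32i16, 2 } added above gives the intrinsic a
//  total cost of 1 * 2 = 2.)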
+ if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) return LT.first * Entry->Cost; @@ -4976,9 +4995,13 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost( const Instruction *I = nullptr) { if (CostKind != TTI::TCK_RecipThroughput) { if ((Opcode == Instruction::Load && - isLegalMaskedGather(SrcVTy, Align(Alignment))) || + isLegalMaskedGather(SrcVTy, Align(Alignment)) && + !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), + Align(Alignment))) || (Opcode == Instruction::Store && - isLegalMaskedScatter(SrcVTy, Align(Alignment)))) + isLegalMaskedScatter(SrcVTy, Align(Alignment)) && + !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), + Align(Alignment)))) return 1; return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, Alignment, CostKind, I); @@ -4993,9 +5016,13 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost( unsigned AddressSpace = PtrTy->getAddressSpace(); if ((Opcode == Instruction::Load && - !isLegalMaskedGather(SrcVTy, Align(Alignment))) || + (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || + forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), + Align(Alignment)))) || (Opcode == Instruction::Store && - !isLegalMaskedScatter(SrcVTy, Align(Alignment)))) + (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || + forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), + Align(Alignment))))) return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); @@ -5118,35 +5145,21 @@ bool X86TTIImpl::supportsGather() const { return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); } +bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend + // it to 8 elements, but zeroing upper bits of the mask vector will add more + // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: + // Check, maybe the gather/scatter instruction is better in the VariableMask + // case. + unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); + return NumElts == 1 || + (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); +} + bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { if (!supportsGather()) return false; - - // This function is called now in two cases: from the Loop Vectorizer - // and from the Scalarizer. - // When the Loop Vectorizer asks about legality of the feature, - // the vectorization factor is not calculated yet. The Loop Vectorizer - // sends a scalar type and the decision is based on the width of the - // scalar element. - // Later on, the cost model will estimate usage this intrinsic based on - // the vector type. - // The Scalarizer asks again about legality. It sends a vector type. - // In this case we can reject non-power-of-2 vectors. - // We also reject single element vectors as the type legalizer can't - // scalarize it. - if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) { - unsigned NumElts = DataVTy->getNumElements(); - if (NumElts == 1) - return false; - // Gather / Scatter for vector 2 is not profitable on KNL / SKX - // Vector-4 of gather/scatter instruction does not exist on KNL. - // We can extend it to 8 elements, but zeroing upper bits of - // the mask vector will add more instructions. 
Right now we give the scalar - // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter - // instruction is better in the VariableMask case. - if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))) - return false; - } Type *ScalarTy = DataTy->getScalarType(); if (ScalarTy->isPointerTy()) return true; @@ -5187,9 +5200,48 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, const FeatureBitset &CalleeBits = TM.getSubtargetImpl(*Callee)->getFeatureBits(); + // Check whether features are the same (apart from the ignore list). FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; - return (RealCallerBits & RealCalleeBits) == RealCalleeBits; + if (RealCallerBits == RealCalleeBits) + return true; + + // If the features are a subset, we need to additionally check for calls + // that may become ABI-incompatible as a result of inlining. + if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) + return false; + + for (const Instruction &I : instructions(Callee)) { + if (const auto *CB = dyn_cast<CallBase>(&I)) { + SmallVector<Type *, 8> Types; + for (Value *Arg : CB->args()) + Types.push_back(Arg->getType()); + if (!CB->getType()->isVoidTy()) + Types.push_back(CB->getType()); + + // Simple types are always ABI compatible. + auto IsSimpleTy = [](Type *Ty) { + return !Ty->isVectorTy() && !Ty->isAggregateType(); + }; + if (all_of(Types, IsSimpleTy)) + continue; + + if (Function *NestedCallee = CB->getCalledFunction()) { + // Assume that intrinsics are always ABI compatible. + if (NestedCallee->isIntrinsic()) + continue; + + // Do a precise compatibility check. + if (!areTypesABICompatible(Caller, NestedCallee, Types)) + return false; + } else { + // We don't know the target features of the callee, + // assume it is incompatible. 
+ return false; + } + } + } + return true; } bool X86TTIImpl::areTypesABICompatible(const Function *Caller, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 11e9cb09c7d5..69715072426f 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -226,6 +226,10 @@ public: bool isLegalMaskedStore(Type *DataType, Align Alignment); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); + bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment); + bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { + return forceScalarizeMaskedGather(VTy, Alignment); + } bool isLegalMaskedGather(Type *DataType, Align Alignment); bool isLegalMaskedScatter(Type *DataType, Align Alignment); bool isLegalMaskedExpandLoad(Type *DataType); diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index f2f89f4269ed..19ebcb3ea3e8 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -428,7 +428,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters( DL = MI->getDebugLoc(); for (const CalleeSavedInfo &I : CSI) { - unsigned Reg = I.getReg(); + Register Reg = I.getReg(); assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) && "LR & FP are always handled in emitPrologue"); @@ -455,7 +455,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters( if (!AtStart) --BeforeI; for (const CalleeSavedInfo &CSR : CSI) { - unsigned Reg = CSR.getReg(); + Register Reg = CSR.getReg(); assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) && "LR & FP are always handled in emitEpilogue"); diff --git a/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp index 6799823f6fcb..0d1ba39b8b10 100644 --- a/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -97,7 +97,7 @@ static void InsertFPConstInst(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); DebugLoc dl = MI.getDebugLoc(); - unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0); + Register ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0); RS->setRegUsed(ScratchOffset); TII.loadImmediate(MBB, II, ScratchOffset, Offset); @@ -174,7 +174,7 @@ static void InsertSPConstInst(MachineBasicBlock::iterator II, } else ScratchBase = Reg; BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0); - unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0); + Register ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0); RS->setRegUsed(ScratchOffset); TII.loadImmediate(MBB, II, ScratchOffset, Offset); diff --git a/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/llvm/lib/Target/XCore/XCoreSubtarget.cpp index 1be707cb488c..d4b777ef447f 100644 --- a/llvm/lib/Target/XCore/XCoreSubtarget.cpp +++ b/llvm/lib/Target/XCore/XCoreSubtarget.cpp @@ -26,5 +26,5 @@ void XCoreSubtarget::anchor() { } XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) - : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(), - FrameLowering(*this), TLInfo(TM, *this), TSInfo() {} + : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(*this), + TLInfo(TM, *this) {} diff --git a/llvm/lib/TextAPI/Architecture.cpp 
b/llvm/lib/TextAPI/Architecture.cpp index e1901d5c0ce5..bb349b21774e 100644 --- a/llvm/lib/TextAPI/Architecture.cpp +++ b/llvm/lib/TextAPI/Architecture.cpp @@ -15,7 +15,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/TextAPI/ArchitectureSet.h" +#include "llvm/Support/ErrorHandling.h" namespace llvm { namespace MachO { diff --git a/llvm/lib/TextAPI/PackedVersion.cpp b/llvm/lib/TextAPI/PackedVersion.cpp index f8171e02b6d3..67fb30aeb127 100644 --- a/llvm/lib/TextAPI/PackedVersion.cpp +++ b/llvm/lib/TextAPI/PackedVersion.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/TextAPI/PackedVersion.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Format.h" diff --git a/llvm/lib/TextAPI/Platform.cpp b/llvm/lib/TextAPI/Platform.cpp index a2ce6d0cac86..c3c74252301e 100644 --- a/llvm/lib/TextAPI/Platform.cpp +++ b/llvm/lib/TextAPI/Platform.cpp @@ -18,120 +18,118 @@ namespace llvm { namespace MachO { -PlatformKind mapToPlatformKind(PlatformKind Platform, bool WantSim) { +PlatformType mapToPlatformType(PlatformType Platform, bool WantSim) { switch (Platform) { default: return Platform; - case PlatformKind::iOS: - return WantSim ? PlatformKind::iOSSimulator : PlatformKind::iOS; - case PlatformKind::tvOS: - return WantSim ? PlatformKind::tvOSSimulator : PlatformKind::tvOS; - case PlatformKind::watchOS: - return WantSim ? PlatformKind::watchOSSimulator : PlatformKind::watchOS; + case PLATFORM_IOS: + return WantSim ? PLATFORM_IOSSIMULATOR : PLATFORM_IOS; + case PLATFORM_TVOS: + return WantSim ? PLATFORM_TVOSSIMULATOR : PLATFORM_TVOS; + case PLATFORM_WATCHOS: + return WantSim ? PLATFORM_WATCHOSSIMULATOR : PLATFORM_WATCHOS; } - llvm_unreachable("Unknown llvm::MachO::PlatformKind enum"); } -PlatformKind mapToPlatformKind(const Triple &Target) { +PlatformType mapToPlatformType(const Triple &Target) { switch (Target.getOS()) { default: - return PlatformKind::unknown; + return PLATFORM_UNKNOWN; case Triple::MacOSX: - return PlatformKind::macOS; + return PLATFORM_MACOS; case Triple::IOS: if (Target.isSimulatorEnvironment()) - return PlatformKind::iOSSimulator; + return PLATFORM_IOSSIMULATOR; if (Target.getEnvironment() == Triple::MacABI) - return PlatformKind::macCatalyst; - return PlatformKind::iOS; + return PLATFORM_MACCATALYST; + return PLATFORM_IOS; case Triple::TvOS: - return Target.isSimulatorEnvironment() ? PlatformKind::tvOSSimulator - : PlatformKind::tvOS; + return Target.isSimulatorEnvironment() ? PLATFORM_TVOSSIMULATOR + : PLATFORM_TVOS; case Triple::WatchOS: - return Target.isSimulatorEnvironment() ? PlatformKind::watchOSSimulator - : PlatformKind::watchOS; + return Target.isSimulatorEnvironment() ? 
PLATFORM_WATCHOSSIMULATOR + : PLATFORM_WATCHOS; // TODO: add bridgeOS & driverKit once in llvm::Triple } - llvm_unreachable("Unknown Target Triple"); } PlatformSet mapToPlatformSet(ArrayRef<Triple> Targets) { PlatformSet Result; for (const auto &Target : Targets) - Result.insert(mapToPlatformKind(Target)); + Result.insert(mapToPlatformType(Target)); return Result; } -StringRef getPlatformName(PlatformKind Platform) { +StringRef getPlatformName(PlatformType Platform) { switch (Platform) { - case PlatformKind::unknown: + case PLATFORM_UNKNOWN: return "unknown"; - case PlatformKind::macOS: + case PLATFORM_MACOS: return "macOS"; - case PlatformKind::iOS: + case PLATFORM_IOS: return "iOS"; - case PlatformKind::tvOS: + case PLATFORM_TVOS: return "tvOS"; - case PlatformKind::watchOS: + case PLATFORM_WATCHOS: return "watchOS"; - case PlatformKind::bridgeOS: + case PLATFORM_BRIDGEOS: return "bridgeOS"; - case PlatformKind::macCatalyst: + case PLATFORM_MACCATALYST: return "macCatalyst"; - case PlatformKind::iOSSimulator: + case PLATFORM_IOSSIMULATOR: return "iOS Simulator"; - case PlatformKind::tvOSSimulator: + case PLATFORM_TVOSSIMULATOR: return "tvOS Simulator"; - case PlatformKind::watchOSSimulator: + case PLATFORM_WATCHOSSIMULATOR: return "watchOS Simulator"; - case PlatformKind::driverKit: + case PLATFORM_DRIVERKIT: return "DriverKit"; } - llvm_unreachable("Unknown llvm::MachO::PlatformKind enum"); + llvm_unreachable("Unknown llvm::MachO::PlatformType enum"); } -PlatformKind getPlatformFromName(StringRef Name) { - return StringSwitch<PlatformKind>(Name) - .Case("macos", PlatformKind::macOS) - .Case("ios", PlatformKind::iOS) - .Case("tvos", PlatformKind::tvOS) - .Case("watchos", PlatformKind::watchOS) - .Case("bridgeos", PlatformKind::macOS) - .Case("ios-macabi", PlatformKind::macCatalyst) - .Case("ios-simulator", PlatformKind::iOSSimulator) - .Case("tvos-simulator", PlatformKind::tvOSSimulator) - .Case("watchos-simulator", PlatformKind::watchOSSimulator) - .Case("driverkit", PlatformKind::driverKit) - .Default(PlatformKind::unknown); +PlatformType getPlatformFromName(StringRef Name) { + return StringSwitch<PlatformType>(Name) + .Case("macos", PLATFORM_MACOS) + .Case("ios", PLATFORM_IOS) + .Case("tvos", PLATFORM_TVOS) + .Case("watchos", PLATFORM_WATCHOS) + .Case("bridgeos", PLATFORM_BRIDGEOS) + .Case("ios-macabi", PLATFORM_MACCATALYST) + .Case("ios-simulator", PLATFORM_IOSSIMULATOR) + .Case("tvos-simulator", PLATFORM_TVOSSIMULATOR) + .Case("watchos-simulator", PLATFORM_WATCHOSSIMULATOR) + .Case("driverkit", PLATFORM_DRIVERKIT) + .Default(PLATFORM_UNKNOWN); } -std::string getOSAndEnvironmentName(PlatformKind Platform, +std::string getOSAndEnvironmentName(PlatformType Platform, std::string Version) { switch (Platform) { - case PlatformKind::unknown: + case PLATFORM_UNKNOWN: return "darwin" + Version; - case PlatformKind::macOS: + case PLATFORM_MACOS: return "macos" + Version; - case PlatformKind::iOS: + case PLATFORM_IOS: return "ios" + Version; - case PlatformKind::tvOS: + case PLATFORM_TVOS: return "tvos" + Version; - case PlatformKind::watchOS: + case PLATFORM_WATCHOS: return "watchos" + Version; - case PlatformKind::bridgeOS: + case PLATFORM_BRIDGEOS: return "bridgeos" + Version; - case PlatformKind::macCatalyst: + case PLATFORM_MACCATALYST: return "ios" + Version + "-macabi"; - case PlatformKind::iOSSimulator: + case PLATFORM_IOSSIMULATOR: return "ios" + Version + "-simulator"; - case PlatformKind::tvOSSimulator: + case PLATFORM_TVOSSIMULATOR: return "tvos" + Version + "-simulator"; - case 
PlatformKind::watchOSSimulator: + case PLATFORM_WATCHOSSIMULATOR: return "watchos" + Version + "-simulator"; - case PlatformKind::driverKit: + case PLATFORM_DRIVERKIT: return "driverkit" + Version; } - llvm_unreachable("Unknown llvm::MachO::PlatformKind enum"); + llvm_unreachable("Unknown llvm::MachO::PlatformType enum"); } } // end namespace MachO. diff --git a/llvm/lib/TextAPI/Target.cpp b/llvm/lib/TextAPI/Target.cpp index 35fe1bf65e6f..c54c3bd66b9d 100644 --- a/llvm/lib/TextAPI/Target.cpp +++ b/llvm/lib/TextAPI/Target.cpp @@ -7,11 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/TextAPI/Target.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/Format.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -22,26 +19,26 @@ Expected<Target> Target::create(StringRef TargetValue) { auto ArchitectureStr = Result.first; auto Architecture = getArchitectureFromName(ArchitectureStr); auto PlatformStr = Result.second; - PlatformKind Platform; - Platform = StringSwitch<PlatformKind>(PlatformStr) - .Case("macos", PlatformKind::macOS) - .Case("ios", PlatformKind::iOS) - .Case("tvos", PlatformKind::tvOS) - .Case("watchos", PlatformKind::watchOS) - .Case("bridgeos", PlatformKind::bridgeOS) - .Case("maccatalyst", PlatformKind::macCatalyst) - .Case("ios-simulator", PlatformKind::iOSSimulator) - .Case("tvos-simulator", PlatformKind::tvOSSimulator) - .Case("watchos-simulator", PlatformKind::watchOSSimulator) - .Case("driverkit", PlatformKind::driverKit) - .Default(PlatformKind::unknown); + PlatformType Platform; + Platform = StringSwitch<PlatformType>(PlatformStr) + .Case("macos", PLATFORM_MACOS) + .Case("ios", PLATFORM_IOS) + .Case("tvos", PLATFORM_TVOS) + .Case("watchos", PLATFORM_WATCHOS) + .Case("bridgeos", PLATFORM_BRIDGEOS) + .Case("maccatalyst", PLATFORM_MACCATALYST) + .Case("ios-simulator", PLATFORM_IOSSIMULATOR) + .Case("tvos-simulator", PLATFORM_TVOSSIMULATOR) + .Case("watchos-simulator", PLATFORM_WATCHOSSIMULATOR) + .Case("driverkit", PLATFORM_DRIVERKIT) + .Default(PLATFORM_UNKNOWN); - if (Platform == PlatformKind::unknown) { + if (Platform == PLATFORM_UNKNOWN) { if (PlatformStr.startswith("<") && PlatformStr.endswith(">")) { PlatformStr = PlatformStr.drop_front().drop_back(); unsigned long long RawValue; if (!PlatformStr.getAsInteger(10, RawValue)) - Platform = (PlatformKind)RawValue; + Platform = (PlatformType)RawValue; } } diff --git a/llvm/lib/TextAPI/TextStub.cpp b/llvm/lib/TextAPI/TextStub.cpp index b64f19ab65cc..ff93e43356f7 100644 --- a/llvm/lib/TextAPI/TextStub.cpp +++ b/llvm/lib/TextAPI/TextStub.cpp @@ -380,34 +380,34 @@ template <> struct ScalarTraits<Target> { default: OS << "unknown"; break; - case PlatformKind::macOS: + case PLATFORM_MACOS: OS << "macos"; break; - case PlatformKind::iOS: + case PLATFORM_IOS: OS << "ios"; break; - case PlatformKind::tvOS: + case PLATFORM_TVOS: OS << "tvos"; break; - case PlatformKind::watchOS: + case PLATFORM_WATCHOS: OS << "watchos"; break; - case PlatformKind::bridgeOS: + case PLATFORM_BRIDGEOS: OS << "bridgeos"; break; - case PlatformKind::macCatalyst: + case PLATFORM_MACCATALYST: OS << "maccatalyst"; break; - case PlatformKind::iOSSimulator: + case PLATFORM_IOSSIMULATOR: OS << "ios-simulator"; break; - case PlatformKind::tvOSSimulator: + case PLATFORM_TVOSSIMULATOR: OS << "tvos-simulator"; break; - case PlatformKind::watchOSSimulator: + case 
PLATFORM_WATCHOSSIMULATOR: OS << "watchos-simulator"; break; - case PlatformKind::driverKit: + case PLATFORM_DRIVERKIT: OS << "driverkit"; break; } @@ -423,7 +423,7 @@ template <> struct ScalarTraits<Target> { Value = *Result; if (Value.Arch == AK_unknown) return "unknown architecture"; - if (Value.Platform == PlatformKind::unknown) + if (Value.Platform == PLATFORM_UNKNOWN) return "unknown platform"; return {}; @@ -597,11 +597,10 @@ template <> struct MappingTraits<const InterfaceFile *> { TargetList Targets; for (auto Platform : Platforms) { - Platform = mapToPlatformKind(Platform, Architectures.hasX86()); + Platform = mapToPlatformType(Platform, Architectures.hasX86()); for (const auto &&Architecture : Architectures) { - if ((Architecture == AK_i386) && - (Platform == PlatformKind::macCatalyst)) + if ((Architecture == AK_i386) && (Platform == PLATFORM_MACCATALYST)) continue; Targets.emplace_back(Architecture, Platform); diff --git a/llvm/lib/TextAPI/TextStubCommon.cpp b/llvm/lib/TextAPI/TextStubCommon.cpp index c2713b9b5203..29b74f981a91 100644 --- a/llvm/lib/TextAPI/TextStubCommon.cpp +++ b/llvm/lib/TextAPI/TextStubCommon.cpp @@ -49,8 +49,8 @@ void ScalarTraits<PlatformSet>::output(const PlatformSet &Values, void *IO, assert((!Ctx || Ctx->FileKind != FileType::Invalid) && "File type is not set in context"); - if (Ctx && Ctx->FileKind == TBD_V3 && Values.count(PlatformKind::macOS) && - Values.count(PlatformKind::macCatalyst)) { + if (Ctx && Ctx->FileKind == TBD_V3 && Values.count(PLATFORM_MACOS) && + Values.count(PLATFORM_MACCATALYST)) { OS << "zippered"; return; } @@ -60,31 +60,31 @@ void ScalarTraits<PlatformSet>::output(const PlatformSet &Values, void *IO, default: llvm_unreachable("unexpected platform"); break; - case PlatformKind::macOS: + case PLATFORM_MACOS: OS << "macosx"; break; - case PlatformKind::iOSSimulator: + case PLATFORM_IOSSIMULATOR: LLVM_FALLTHROUGH; - case PlatformKind::iOS: + case PLATFORM_IOS: OS << "ios"; break; - case PlatformKind::watchOSSimulator: + case PLATFORM_WATCHOSSIMULATOR: LLVM_FALLTHROUGH; - case PlatformKind::watchOS: + case PLATFORM_WATCHOS: OS << "watchos"; break; - case PlatformKind::tvOSSimulator: + case PLATFORM_TVOSSIMULATOR: LLVM_FALLTHROUGH; - case PlatformKind::tvOS: + case PLATFORM_TVOS: OS << "tvos"; break; - case PlatformKind::bridgeOS: + case PLATFORM_BRIDGEOS: OS << "bridgeos"; break; - case PlatformKind::macCatalyst: + case PLATFORM_MACCATALYST: OS << "iosmac"; break; - case PlatformKind::driverKit: + case PLATFORM_DRIVERKIT: OS << "driverkit"; break; } @@ -98,28 +98,27 @@ StringRef ScalarTraits<PlatformSet>::input(StringRef Scalar, void *IO, if (Scalar == "zippered") { if (Ctx && Ctx->FileKind == FileType::TBD_V3) { - Values.insert(PlatformKind::macOS); - Values.insert(PlatformKind::macCatalyst); + Values.insert(PLATFORM_MACOS); + Values.insert(PLATFORM_MACCATALYST); return {}; } return "invalid platform"; } - auto Platform = StringSwitch<PlatformKind>(Scalar) - .Case("unknown", PlatformKind::unknown) - .Case("macosx", PlatformKind::macOS) - .Case("ios", PlatformKind::iOS) - .Case("watchos", PlatformKind::watchOS) - .Case("tvos", PlatformKind::tvOS) - .Case("bridgeos", PlatformKind::bridgeOS) - .Case("iosmac", PlatformKind::macCatalyst) - .Default(PlatformKind::unknown); - - if (Platform == PlatformKind::macCatalyst) + auto Platform = StringSwitch<PlatformType>(Scalar) + .Case("macosx", PLATFORM_MACOS) + .Case("ios", PLATFORM_IOS) + .Case("watchos", PLATFORM_WATCHOS) + .Case("tvos", PLATFORM_TVOS) + .Case("bridgeos", PLATFORM_BRIDGEOS) 
+ .Case("iosmac", PLATFORM_MACCATALYST) + .Default(PLATFORM_UNKNOWN); + + if (Platform == PLATFORM_MACCATALYST) if (Ctx && Ctx->FileKind != FileType::TBD_V3) return "invalid platform"; - if (Platform == PlatformKind::unknown) + if (Platform == PLATFORM_UNKNOWN) return "unknown platform"; Values.insert(Platform); @@ -226,7 +225,7 @@ StringRef ScalarTraits<UUID>::input(StringRef Scalar, void *, UUID &Value) { if (UUID.empty()) return "invalid uuid string pair"; Value.second = std::string(UUID); - Value.first = Target{getArchitectureFromName(Arch), PlatformKind::unknown}; + Value.first = Target{getArchitectureFromName(Arch), PLATFORM_UNKNOWN}; return {}; } diff --git a/llvm/lib/TextAPI/TextStubCommon.h b/llvm/lib/TextAPI/TextStubCommon.h index 89ae5d56297c..aac27221b5ff 100644 --- a/llvm/lib/TextAPI/TextStubCommon.h +++ b/llvm/lib/TextAPI/TextStubCommon.h @@ -16,9 +16,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/TextAPI/Architecture.h" -#include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/InterfaceFile.h" -#include "llvm/TextAPI/PackedVersion.h" +#include "llvm/TextAPI/Platform.h" +#include "llvm/TextAPI/Target.h" using UUID = std::pair<llvm::MachO::Target, std::string>; @@ -28,6 +28,11 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(UUID) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(FlowStringRef) namespace llvm { + +namespace MachO { + class ArchitectureSet; + class PackedVersion; +} namespace yaml { template <> struct ScalarTraits<FlowStringRef> { diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 68a34bdcb1cd..1533e1805f17 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -176,11 +176,14 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { lowerCoroNoop(cast<IntrinsicInst>(&I)); break; case Intrinsic::coro_id: - // Mark a function that comes out of the frontend that has a coro.id - // with a coroutine attribute. if (auto *CII = cast<CoroIdInst>(&I)) { if (CII->getInfo().isPreSplit()) { - F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT); + assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) && + F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() == + UNPREPARED_FOR_SPLIT && + "The frontend uses Swtich-Resumed ABI should emit " + "\"coroutine.presplit\" attribute with value \"0\" for the " + "coroutine."); setCannotDuplicate(CII); CII->setCoroutineSelf(); CoroId = cast<CoroIdInst>(&I); @@ -190,6 +193,8 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { case Intrinsic::coro_id_retcon: case Intrinsic::coro_id_retcon_once: case Intrinsic::coro_id_async: + // TODO: Remove the line once we support it in the corresponding + // frontend. 
F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); break; case Intrinsic::coro_resume: diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index a0d12865bd3a..92acfb93057a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -587,7 +587,7 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F, } }); - if (!Shape.ReuseFrameSlot && !EnableReuseStorageInFrame) { + if (!Shape.OptimizeFrame && !EnableReuseStorageInFrame) { for (const auto &A : FrameData.Allocas) { AllocaInst *Alloca = A.Alloca; NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca)); @@ -808,7 +808,7 @@ static StringRef solveTypeName(Type *Ty) { if (Ty->isPointerTy()) { auto *PtrTy = cast<PointerType>(Ty); - Type *PointeeTy = PtrTy->getElementType(); + Type *PointeeTy = PtrTy->getPointerElementType(); auto Name = solveTypeName(PointeeTy); if (Name == "UnknownType") return "PointerType"; @@ -1659,7 +1659,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData, &*Builder.GetInsertPoint()); // This dbg.declare is for the main function entry point. It // will be deleted in all coro-split functions. - coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.ReuseFrameSlot); + coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame); } } @@ -2278,7 +2278,7 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); auto ArgTy = cast<PointerType>(Arg.getType()); - auto ValueTy = ArgTy->getElementType(); + auto ValueTy = ArgTy->getPointerElementType(); // Reduce to the alloca case: @@ -2506,7 +2506,7 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape, void coro::salvageDebugInfo( SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache, - DbgVariableIntrinsic *DVI, bool ReuseFrameSlot) { + DbgVariableIntrinsic *DVI, bool OptimizeFrame) { Function *F = DVI->getFunction(); IRBuilder<> Builder(F->getContext()); auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); @@ -2558,7 +2558,7 @@ void coro::salvageDebugInfo( // // Avoid creating an alloca that would be eliminated by optimization // passes, since the corresponding dbg.declares would become invalid. - if (!ReuseFrameSlot && !EnableReuseStorageInFrame) + if (!OptimizeFrame && !EnableReuseStorageInFrame) if (auto *Arg = dyn_cast<llvm::Argument>(Storage)) { auto &Cached = DbgPtrAllocaCache[Storage]; if (!Cached) { diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h index bf3d781ba43e..014938c15a0a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -599,6 +599,18 @@ public: } }; +/// This represents the llvm.coro.align instruction.
+class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_align; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst { enum { FrameArg, UnwindArg }; diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 27ba8524f975..9a17068df3a9 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -36,6 +36,11 @@ void initializeCoroCleanupLegacyPass(PassRegistry &); // adds coroutine subfunctions to the SCC to be processed by the IPO pipeline. // Async lowering similarly triggers a restart of the pipeline after it has // split the coroutine. +// +// FIXME: Refactor these attributes as LLVM attributes instead of string +// attributes since these attributes are already used outside LLVM's +// coroutine module. +// FIXME: Remove these values once we remove the Legacy PM. #define CORO_PRESPLIT_ATTR "coroutine.presplit" #define UNPREPARED_FOR_SPLIT "0" #define PREPARED_FOR_SPLIT "1" @@ -54,7 +59,7 @@ void updateCallGraph(Function &Caller, ArrayRef<Function *> Funcs, /// holding a pointer to the coroutine frame. void salvageDebugInfo( SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache, - DbgVariableIntrinsic *DVI, bool ReuseFrameSlot); + DbgVariableIntrinsic *DVI, bool OptimizeFrame); // Keeps data and helper functions for lowering coroutine intrinsics. struct LowererBase { @@ -99,6 +104,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { CoroBeginInst *CoroBegin; SmallVector<AnyCoroEndInst *, 4> CoroEnds; SmallVector<CoroSizeInst *, 2> CoroSizes; + SmallVector<CoroAlignInst *, 2> CoroAligns; SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends; SmallVector<CallInst*, 2> SwiftErrorOps; @@ -126,7 +132,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { BasicBlock *AllocaSpillBlock; /// This would only be true if optimizations are enabled.
- bool ReuseFrameSlot; + bool OptimizeFrame; struct SwitchLoweringStorage { SwitchInst *ResumeSwitch; @@ -272,8 +278,8 @@ struct LLVM_LIBRARY_VISIBILITY Shape { void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; Shape() = default; - explicit Shape(Function &F, bool ReuseFrameSlot = false) - : ReuseFrameSlot(ReuseFrameSlot) { + explicit Shape(Function &F, bool OptimizeFrame = false) + : OptimizeFrame(OptimizeFrame) { buildFrom(F); } void buildFrom(Function &F); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 12c1829524ef..b5129809c6a6 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -617,7 +618,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, Value *CachedSlot = nullptr; auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * { if (CachedSlot) { - assert(CachedSlot->getType()->getPointerElementType() == ValueTy && + assert(cast<PointerType>(CachedSlot->getType()) + ->isOpaqueOrPointeeTypeMatches(ValueTy) && "multiple swifterror slots in function with different types"); return CachedSlot; } @@ -626,7 +628,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, for (auto &Arg : F.args()) { if (Arg.isSwiftError()) { CachedSlot = &Arg; - assert(Arg.getType()->getPointerElementType() == ValueTy && + assert(cast<PointerType>(Arg.getType()) + ->isOpaqueOrPointeeTypeMatches(ValueTy) && "swifterror argument does not have expected type"); return &Arg; } @@ -682,7 +685,7 @@ void CoroCloner::salvageDebugInfo() { if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) Worklist.push_back(DVI); for (DbgVariableIntrinsic *DVI : Worklist) - coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.ReuseFrameSlot); + coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.OptimizeFrame); // Remove all salvaged dbg.declare intrinsics that became // either unreachable or stale due to the CoroSplit transformation. @@ -835,7 +838,7 @@ Value *CoroCloner::deriveNewFramePointer() { static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context, unsigned ParamIndex, uint64_t Size, Align Alignment) { - AttrBuilder ParamAttrs; + AttrBuilder ParamAttrs(Context); ParamAttrs.addAttribute(Attribute::NonNull); ParamAttrs.addAttribute(Attribute::NoAlias); ParamAttrs.addAlignmentAttr(Alignment); @@ -845,14 +848,14 @@ static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context, static void addAsyncContextAttrs(AttributeList &Attrs, LLVMContext &Context, unsigned ParamIndex) { - AttrBuilder ParamAttrs; + AttrBuilder ParamAttrs(Context); ParamAttrs.addAttribute(Attribute::SwiftAsync); Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs); } static void addSwiftSelfAttrs(AttributeList &Attrs, LLVMContext &Context, unsigned ParamIndex) { - AttrBuilder ParamAttrs; + AttrBuilder ParamAttrs(Context); ParamAttrs.addAttribute(Attribute::SwiftSelf); Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs); } @@ -929,7 +932,7 @@ void CoroCloner::create() { case coro::ABI::Switch: // Bootstrap attributes by copying function attributes from the // original function. This should include optimization settings and so on. 
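The AttrBuilder hunks above and below reflect an LLVM 14 API change: AttrBuilder now carries an LLVMContext, so a default-constructed builder is no longer available. A hedged sketch of the new pattern, modeled on addFramePointerAttrs (the function name here is illustrative):

#include "llvm/IR/Attributes.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

static AttributeList addFrameParamAttrs(LLVMContext &Ctx, AttributeList Attrs,
                                        unsigned ParamIndex, Align A) {
  AttrBuilder B(Ctx); // the context argument is now mandatory
  B.addAttribute(Attribute::NonNull);
  B.addAttribute(Attribute::NoAlias);
  B.addAlignmentAttr(A);
  return Attrs.addParamAttributes(Ctx, ParamIndex, B);
}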
- NewAttrs = NewAttrs.addFnAttributes(Context, OrigAttrs.getFnAttrs()); + NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, OrigAttrs.getFnAttrs())); addFramePointerAttrs(NewAttrs, Context, 0, Shape.FrameSize, Shape.FrameAlign); @@ -952,7 +955,7 @@ void CoroCloner::create() { // Transfer the original function's attributes. auto FnAttrs = OrigF.getAttributes().getFnAttrs(); - NewAttrs = NewAttrs.addFnAttributes(Context, FnAttrs); + NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, FnAttrs)); break; } case coro::ABI::Retcon: @@ -1082,10 +1085,16 @@ static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) { Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct); } -static void replaceFrameSize(coro::Shape &Shape) { +static void replaceFrameSizeAndAlignment(coro::Shape &Shape) { if (Shape.ABI == coro::ABI::Async) updateAsyncFuncPointerContextSize(Shape); + for (CoroAlignInst *CA : Shape.CoroAligns) { + CA->replaceAllUsesWith( + ConstantInt::get(CA->getType(), Shape.FrameAlign.value())); + CA->eraseFromParent(); + } + if (Shape.CoroSizes.empty()) return; @@ -1197,10 +1206,34 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { DenseMap<Value *, Value *> ResolvedValues; BasicBlock *UnconditionalSucc = nullptr; + assert(InitialInst->getModule()); + const DataLayout &DL = InitialInst->getModule()->getDataLayout(); + + auto GetFirstValidInstruction = [](Instruction *I) { + while (I) { + // A BitCastInst doesn't generate any actual code, so we can skip it. + if (isa<BitCastInst>(I) || I->isDebugOrPseudoInst() || + I->isLifetimeStartOrEnd()) + I = I->getNextNode(); + else if (isInstructionTriviallyDead(I)) + // Since we are in the middle of the transformation, we need to + // erase the dead instruction manually. + I = &*I->eraseFromParent(); + else + break; + } + return I; + }; + + auto TryResolveConstant = [&ResolvedValues](Value *V) { + auto It = ResolvedValues.find(V); + if (It != ResolvedValues.end()) + V = It->second; + return dyn_cast<ConstantInt>(V); + }; Instruction *I = InitialInst; - while (I->isTerminator() || - (isa<CmpInst>(I) && I->getNextNode()->isTerminator())) { + while (I->isTerminator() || isa<CmpInst>(I)) { if (isa<ReturnInst>(I)) { if (I != InitialInst) { // If InitialInst is an unconditional branch, @@ -1213,48 +1246,68 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { } if (auto *BR = dyn_cast<BranchInst>(I)) { if (BR->isUnconditional()) { - BasicBlock *BB = BR->getSuccessor(0); + BasicBlock *Succ = BR->getSuccessor(0); if (I == InitialInst) - UnconditionalSucc = BB; - scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); - I = BB->getFirstNonPHIOrDbgOrLifetime(); + UnconditionalSucc = Succ; + scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues); + I = GetFirstValidInstruction(Succ->getFirstNonPHIOrDbgOrLifetime()); continue; } - } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) { - auto *BR = dyn_cast<BranchInst>(I->getNextNode()); - if (BR && BR->isConditional() && CondCmp == BR->getCondition()) { - // If the case number of suspended switch instruction is reduced to - // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator. - // And the comparsion looks like : %cond = icmp eq i8 %V, constant.
- ConstantInt *CondConst = dyn_cast<ConstantInt>(CondCmp->getOperand(1)); - if (CondConst && CondCmp->getPredicate() == CmpInst::ICMP_EQ) { - Value *V = CondCmp->getOperand(0); - auto it = ResolvedValues.find(V); - if (it != ResolvedValues.end()) - V = it->second; - - if (ConstantInt *Cond0 = dyn_cast<ConstantInt>(V)) { - BasicBlock *BB = Cond0->equalsInt(CondConst->getZExtValue()) - ? BR->getSuccessor(0) - : BR->getSuccessor(1); - scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); - I = BB->getFirstNonPHIOrDbgOrLifetime(); - continue; - } - } - } - } else if (auto *SI = dyn_cast<SwitchInst>(I)) { - Value *V = SI->getCondition(); - auto it = ResolvedValues.find(V); - if (it != ResolvedValues.end()) - V = it->second; - if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { - BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor(); - scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); - I = BB->getFirstNonPHIOrDbgOrLifetime(); + + BasicBlock *BB = BR->getParent(); + // Handle the case where the condition of the conditional branch is + // constant, e.g., + // + // br i1 false, label %cleanup, label %CoroEnd + // + // This can happen during the transformation, and we can continue + // simplifying in this case. + if (ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true)) { + // Handle this branch in the next iteration. + I = BB->getTerminator(); continue; } + } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) { + // If the number of cases of a suspended switch instruction is reduced + // to 1, it is simplified to a CmpInst by llvm::ConstantFoldTerminator. + auto *BR = dyn_cast<BranchInst>( + GetFirstValidInstruction(CondCmp->getNextNode())); + if (!BR || !BR->isConditional() || CondCmp != BR->getCondition()) + return false; + + // The comparison looks like: %cond = icmp eq i8 %V, constant. + // We try to resolve a constant for the first operand only, since the + // second operand should be a literal constant by design. + ConstantInt *Cond0 = TryResolveConstant(CondCmp->getOperand(0)); + auto *Cond1 = dyn_cast<ConstantInt>(CondCmp->getOperand(1)); + if (!Cond0 || !Cond1) + return false; + + // Both operands of the CmpInst are constant, so we can evaluate it + // immediately to get the destination. + auto *ConstResult = + dyn_cast_or_null<ConstantInt>(ConstantFoldCompareInstOperands( + CondCmp->getPredicate(), Cond0, Cond1, DL)); + if (!ConstResult) + return false; + + CondCmp->replaceAllUsesWith(ConstResult); + CondCmp->eraseFromParent(); + + // Handle this branch in the next iteration. + I = BR; + continue; + } else if (auto *SI = dyn_cast<SwitchInst>(I)) { + ConstantInt *Cond = TryResolveConstant(SI->getCondition()); + if (!Cond) + return false; + + BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor(); + scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); + I = GetFirstValidInstruction(BB->getFirstNonPHIOrDbgOrLifetime()); + continue; } + return false; } return false; @@ -1826,20 +1879,20 @@ namespace { static coro::Shape splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones, - bool ReuseFrameSlot) { + bool OptimizeFrame) { PrettyStackTraceFunction prettyStackTrace(F); // The suspend-crossing algorithm in buildCoroutineFrame gets tripped // up by uses in unreachable blocks, so remove them as a first pass.
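The rewritten walk above delegates to two existing utilities, ConstantFoldTerminator for branches whose condition is constant and ConstantFoldCompareInstOperands for the icmp feeding a conditional branch. A hedged sketch of the compare-folding step in isolation (the helper name is illustrative):

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns the successor the folded branch would take, or null if the
// compare does not fold to a ConstantInt.
static BasicBlock *foldCmpBranch(CmpInst *Cmp, BranchInst *BR,
                                 ConstantInt *LHS, ConstantInt *RHS,
                                 const DataLayout &DL) {
  auto *R = dyn_cast_or_null<ConstantInt>(
      ConstantFoldCompareInstOperands(Cmp->getPredicate(), LHS, RHS, DL));
  if (!R)
    return nullptr;
  // An i1 true selects successor 0; false selects successor 1.
  return R->isOne() ? BR->getSuccessor(0) : BR->getSuccessor(1);
}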
removeUnreachableBlocks(F); - coro::Shape Shape(F, ReuseFrameSlot); + coro::Shape Shape(F, OptimizeFrame); if (!Shape.CoroBegin) return Shape; simplifySuspendPoints(Shape); buildCoroutineFrame(F, Shape); - replaceFrameSize(Shape); + replaceFrameSizeAndAlignment(Shape); // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. @@ -2165,7 +2218,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, F.removeFnAttr(CORO_PRESPLIT_ATTR); SmallVector<Function *, 4> Clones; - const coro::Shape Shape = splitCoroutine(F, Clones, ReuseFrameSlot); + const coro::Shape Shape = splitCoroutine(F, Clones, OptimizeFrame); updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM); if (!Shape.CoroSuspends.empty()) { @@ -2198,13 +2251,13 @@ namespace { struct CoroSplitLegacy : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - CoroSplitLegacy(bool ReuseFrameSlot = false) - : CallGraphSCCPass(ID), ReuseFrameSlot(ReuseFrameSlot) { + CoroSplitLegacy(bool OptimizeFrame = false) + : CallGraphSCCPass(ID), OptimizeFrame(OptimizeFrame) { initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry()); } bool Run = false; - bool ReuseFrameSlot; + bool OptimizeFrame; // A coroutine is identified by the presence of coro.begin intrinsic, if // we don't have any, this pass has nothing to do. @@ -2263,7 +2316,7 @@ struct CoroSplitLegacy : public CallGraphSCCPass { F->removeFnAttr(CORO_PRESPLIT_ATTR); SmallVector<Function *, 4> Clones; - const coro::Shape Shape = splitCoroutine(*F, Clones, ReuseFrameSlot); + const coro::Shape Shape = splitCoroutine(*F, Clones, OptimizeFrame); updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC); if (Shape.ABI == coro::ABI::Async) { // Restart SCC passes. @@ -2300,6 +2353,6 @@ INITIALIZE_PASS_END( "Split coroutine into a set of functions driving its state machine", false, false) -Pass *llvm::createCoroSplitLegacyPass(bool ReuseFrameSlot) { - return new CoroSplitLegacy(ReuseFrameSlot); +Pass *llvm::createCoroSplitLegacyPass(bool OptimizeFrame) { + return new CoroSplitLegacy(OptimizeFrame); } diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index fba8b03e44ba..965a146c143f 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -123,6 +123,7 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, static bool isCoroutineIntrinsicName(StringRef Name) { // NOTE: Must be sorted! 
static const char *const CoroIntrinsics[] = { + "llvm.coro.align", "llvm.coro.alloc", "llvm.coro.async.context.alloc", "llvm.coro.async.context.dealloc", @@ -268,6 +269,9 @@ void coro::Shape::buildFrom(Function &F) { case Intrinsic::coro_size: CoroSizes.push_back(cast<CoroSizeInst>(II)); break; + case Intrinsic::coro_align: + CoroAligns.push_back(cast<CoroAlignInst>(II)); + break; case Intrinsic::coro_frame: CoroFrames.push_back(cast<CoroFrameInst>(II)); break; @@ -672,8 +676,11 @@ static void checkAsyncFuncPointer(const Instruction *I, Value *V) { if (!AsyncFuncPtrAddr) fail(I, "llvm.coro.id.async async function pointer not a global", V); - auto *StructTy = - cast<StructType>(AsyncFuncPtrAddr->getType()->getPointerElementType()); + if (AsyncFuncPtrAddr->getType()->isOpaquePointerTy()) + return; + + auto *StructTy = cast<StructType>( + AsyncFuncPtrAddr->getType()->getNonOpaquePointerElementType()); if (StructTy->isOpaque() || !StructTy->isPacked() || StructTy->getNumElements() != 2 || !StructTy->getElementType(0)->isIntegerTy(32) || @@ -697,14 +704,16 @@ void CoroIdAsyncInst::checkWellFormed() const { static void checkAsyncContextProjectFunction(const Instruction *I, Function *F) { auto *FunTy = cast<FunctionType>(F->getValueType()); - if (!FunTy->getReturnType()->isPointerTy() || - !FunTy->getReturnType()->getPointerElementType()->isIntegerTy(8)) + Type *Int8Ty = Type::getInt8Ty(F->getContext()); + auto *RetPtrTy = dyn_cast<PointerType>(FunTy->getReturnType()); + if (!RetPtrTy || !RetPtrTy->isOpaqueOrPointeeTypeMatches(Int8Ty)) fail(I, "llvm.coro.suspend.async resume function projection function must " "return an i8* type", F); if (FunTy->getNumParams() != 1 || !FunTy->getParamType(0)->isPointerTy() || - !FunTy->getParamType(0)->getPointerElementType()->isIntegerTy(8)) + !cast<PointerType>(FunTy->getParamType(0)) + ->isOpaqueOrPointeeTypeMatches(Int8Ty)) fail(I, "llvm.coro.suspend.async resume function projection function must " "take one i8* type as parameter", @@ -719,8 +728,7 @@ void CoroAsyncEndInst::checkWellFormed() const { auto *MustTailCallFunc = getMustTailCallFunction(); if (!MustTailCallFunc) return; - auto *FnTy = - cast<FunctionType>(MustTailCallFunc->getType()->getPointerElementType()); + auto *FnTy = MustTailCallFunc->getFunctionType(); if (FnTy->getNumParams() != (arg_size() - 3)) fail(this, "llvm.coro.end.async must tail call function argument type must " diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 01e724e22dcf..a6d9ce1033f3 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -54,13 +54,13 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, if (F.isPresplitCoroutine()) continue; - if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) && - isInlineViable(F).isSuccess()) { + if (!F.isDeclaration() && isInlineViable(F).isSuccess()) { Calls.clear(); for (User *U : F.users()) if (auto *CB = dyn_cast<CallBase>(U)) - if (CB->getCalledFunction() == &F) + if (CB->getCalledFunction() == &F && + CB->hasFnAttr(Attribute::AlwaysInline)) Calls.insert(CB); for (CallBase *CB : Calls) { @@ -92,10 +92,12 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, Changed = true; } - // Remember to try and delete this function afterward. This both avoids - // re-walking the rest of the module and avoids dealing with any iterator - // invalidation issues while deleting functions. 
- InlinedFunctions.push_back(&F); + if (F.hasFnAttribute(Attribute::AlwaysInline)) { + // Remember to try and delete this function afterward. This both avoids + // re-walking the rest of the module and avoids dealing with any + // iterator invalidation issues while deleting functions. + InlinedFunctions.push_back(&F); + } } } @@ -117,7 +119,7 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M, if (!InlinedFunctions.empty()) { // Now we just have the comdat functions. Filter out the ones whose comdats // are not actually dead. - filterDeadComdatFunctions(M, InlinedFunctions); + filterDeadComdatFunctions(InlinedFunctions); // The remaining functions are actually dead. for (Function *F : InlinedFunctions) { M.getFunctionList().erase(F); diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 3a42a2cac928..ce3c5153bde2 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -196,8 +196,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, for (const auto &ArgIndex : ArgIndices) { // not allowed to dereference ->begin() if size() is 0 Params.push_back(GetElementPtrInst::getIndexedType( - cast<PointerType>(I->getType())->getElementType(), - ArgIndex.second)); + I->getType()->getPointerElementType(), ArgIndex.second)); ArgAttrVec.push_back(AttributeSet()); assert(Params.back()); } @@ -298,7 +297,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, Ops.push_back(ConstantInt::get(IdxTy, II)); // Keep track of the type we're currently indexing. if (auto *ElPTy = dyn_cast<PointerType>(ElTy)) - ElTy = ElPTy->getElementType(); + ElTy = ElPTy->getPointerElementType(); else ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II); } @@ -928,7 +927,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter, SmallPtrSet<Argument *, 8> ArgsToPromote; SmallPtrSet<Argument *, 8> ByValArgsToTransform; for (Argument *PtrArg : PointerArgs) { - Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); + Type *AgTy = PtrArg->getType()->getPointerElementType(); // Replace sret attribute with noalias. This reduces register pressure by // avoiding a register copy. 
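Editor's note on the AlwaysInliner change above: the alwaysinline test moves from the function to the call site. Since CallBase::hasFnAttr also consults the callee, call sites explicitly marked alwaysinline are now inlined even when the callee itself is not so marked, while deleting the function afterwards is still gated on the function-level attribute. A sketch of the gating predicate (hypothetical helper name):

#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// True for a direct call to F that should be always-inlined; hasFnAttr
// returns true if the attribute is on the call site or on F itself.
static bool isAlwaysInlineCallTo(const CallBase &CB, const Function &F) {
  return CB.getCalledFunction() == &F &&
         CB.hasFnAttr(Attribute::AlwaysInline);
}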
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 7e729e57153c..12b8a0ef9d00 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ValueTracking.h" @@ -202,9 +203,12 @@ bool AA::isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA, return NoRecurseAA.isAssumedNoRecurse(); } -Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty) { +Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty, + const TargetLibraryInfo *TLI) { if (isa<AllocaInst>(Obj)) return UndefValue::get(&Ty); + if (isAllocationFn(&Obj, TLI)) + return getInitialValueOfAllocation(&cast<CallBase>(Obj), TLI, &Ty); auto *GV = dyn_cast<GlobalVariable>(&Obj); if (!GV || !GV->hasLocalLinkage()) return nullptr; @@ -316,7 +320,8 @@ bool AA::getPotentialCopiesOfStoredValue( dbgs() << "Underlying object is a valid nullptr, giving up.\n";); return false; } - if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj)) { + if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj) && + !isNoAliasCall(Obj)) { LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj << "\n";); return false; @@ -741,6 +746,7 @@ void IRPosition::verify() { assert((CBContext == nullptr) && "'call site argument' position must not have CallBaseContext!"); Use *U = getAsUsePtr(); + (void)U; // Silence unused variable warning. assert(U && "Expected use for a 'call site argument' position!"); assert(isa<CallBase>(U->getUser()) && "Expected call base user for a 'call site argument' position!"); @@ -999,10 +1005,11 @@ bool Attributor::isAssumedDead(const BasicBlock &BB, return false; } -bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred, - const AbstractAttribute &QueryingAA, - const Value &V, bool CheckBBLivenessOnly, - DepClassTy LivenessDepClass) { +bool Attributor::checkForAllUses( + function_ref<bool(const Use &, bool &)> Pred, + const AbstractAttribute &QueryingAA, const Value &V, + bool CheckBBLivenessOnly, DepClassTy LivenessDepClass, + function_ref<bool(const Use &OldU, const Use &NewU)> EquivalentUseCB) { // Check the trivial case first as it catches void values. 
if (V.use_empty()) @@ -1053,8 +1060,15 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred, << PotentialCopies.size() << " potential copies instead!\n"); for (Value *PotentialCopy : PotentialCopies) - for (const Use &U : PotentialCopy->uses()) - Worklist.push_back(&U); + for (const Use &CopyUse : PotentialCopy->uses()) { + if (EquivalentUseCB && !EquivalentUseCB(*U, CopyUse)) { + LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was " + "rejected by the equivalence call back: " + << *CopyUse << "!\n"); + return false; + } + Worklist.push_back(&CopyUse); + } continue; } } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index b977821bcaa6..76420783b2d1 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -417,12 +417,10 @@ const Value *stripAndAccumulateMinimalOffsets( AttributorAnalysis); } -static const Value *getMinimalBaseOfAccessPointerOperand( - Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I, - int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) { - const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); - if (!Ptr) - return nullptr; +static const Value * +getMinimalBaseOfPointer(Attributor &A, const AbstractAttribute &QueryingAA, + const Value *Ptr, int64_t &BytesOffset, + const DataLayout &DL, bool AllowNonInbounds = false) { APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); const Value *Base = stripAndAccumulateMinimalOffsets( A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds); @@ -431,18 +429,6 @@ static const Value *getMinimalBaseOfAccessPointerOperand( return Base; } -static const Value * -getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset, - const DataLayout &DL, - bool AllowNonInbounds = false) { - const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); - if (!Ptr) - return nullptr; - - return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, - AllowNonInbounds); -} - /// Clamp the information known for all returned values of a function /// (identified by \p QueryingAA) into \p S. template <typename AAType, typename StateType = typename AAType::StateType> @@ -810,14 +796,17 @@ struct AA::PointerInfo::OffsetAndSize : public std::pair<int64_t, int64_t> { int64_t getSize() const { return second; } static OffsetAndSize getUnknown() { return OffsetAndSize(Unknown, Unknown); } + /// Return true if offset or size are unknown. + bool offsetOrSizeAreUnknown() const { + return getOffset() == OffsetAndSize::Unknown || + getSize() == OffsetAndSize::Unknown; + } + /// Return true if this offset and size pair might describe an address that /// overlaps with \p OAS. bool mayOverlap(const OffsetAndSize &OAS) const { // Any unknown value and we are giving up -> overlap. - if (OAS.getOffset() == OffsetAndSize::Unknown || - OAS.getSize() == OffsetAndSize::Unknown || - getOffset() == OffsetAndSize::Unknown || - getSize() == OffsetAndSize::Unknown) + if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown()) return true; // Check if one offset point is in the other interval [offset, offset+size]. 
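The comment above is the heart of mayOverlap. A self-contained illustration of the same half-open interval test with worked values (editor's sketch, not LLVM code):

#include <cstdint>

// Byte ranges [O1, O1+S1) and [O2, O2+S2) overlap iff either start
// falls inside the other range.
static bool rangesOverlap(int64_t O1, int64_t S1, int64_t O2, int64_t S2) {
  return (O1 >= O2 && O1 < O2 + S2) || (O2 >= O1 && O2 < O1 + S1);
}
// rangesOverlap(8, 8, 12, 8) -> true  ([8,16) and [12,20) share [12,16))
// rangesOverlap(0, 4, 4, 4)  -> false ([0,4) and [4,8) merely touch)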
@@ -1024,8 +1013,9 @@ protected: OffsetAndSize ItOAS = It.getFirst(); if (!OAS.mayOverlap(ItOAS)) continue; + bool IsExact = OAS == ItOAS && !OAS.offsetOrSizeAreUnknown(); for (auto &Access : It.getSecond()) - if (!CB(Access, OAS == ItOAS)) + if (!CB(Access, IsExact)) return false; } return true; @@ -1161,27 +1151,34 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { return true; }; + const auto *TLI = getAnchorScope() + ? A.getInfoCache().getTargetLibraryInfoForFunction( + *getAnchorScope()) + : nullptr; auto UsePred = [&](const Use &U, bool &Follow) -> bool { Value *CurPtr = U.get(); User *Usr = U.getUser(); LLVM_DEBUG(dbgs() << "[AAPointerInfo] Analyze " << *CurPtr << " in " << *Usr << "\n"); - - OffsetInfo &PtrOI = OffsetInfoMap[CurPtr]; + assert(OffsetInfoMap.count(CurPtr) && + "The current pointer offset should have been seeded!"); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Usr)) { if (CE->isCast()) - return HandlePassthroughUser(Usr, PtrOI, Follow); + return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow); if (CE->isCompare()) return true; - if (!CE->isGEPWithNoNotionalOverIndexing()) { + if (!isa<GEPOperator>(CE)) { LLVM_DEBUG(dbgs() << "[AAPointerInfo] Unhandled constant user " << *CE << "\n"); return false; } } if (auto *GEP = dyn_cast<GEPOperator>(Usr)) { + // Note the order here, the Usr access might change the map, CurPtr is + // already in it though. OffsetInfo &UsrOI = OffsetInfoMap[Usr]; + OffsetInfo &PtrOI = OffsetInfoMap[CurPtr]; UsrOI = PtrOI; // TODO: Use range information. @@ -1205,19 +1202,22 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { } UsrOI.Offset = PtrOI.Offset + DL.getIndexedOffsetInType( - CurPtr->getType()->getPointerElementType(), Indices); + GEP->getSourceElementType(), Indices); Follow = true; return true; } if (isa<CastInst>(Usr) || isa<SelectInst>(Usr)) - return HandlePassthroughUser(Usr, PtrOI, Follow); + return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow); // For PHIs we need to take care of the recurrence explicitly as the value // might change while we iterate through a loop. For now, we give up if // the PHI is not invariant. if (isa<PHINode>(Usr)) { - // Check if the PHI is invariant (so far). + // Note the order here, the Usr access might change the map, CurPtr is + // already in it though. OffsetInfo &UsrOI = OffsetInfoMap[Usr]; + OffsetInfo &PtrOI = OffsetInfoMap[CurPtr]; + // Check if the PHI is invariant (so far). 
if (UsrOI == PtrOI) return true; @@ -1257,8 +1257,8 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { if (auto *LoadI = dyn_cast<LoadInst>(Usr)) return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr, - AccessKind::AK_READ, PtrOI.Offset, Changed, - LoadI->getType()); + AccessKind::AK_READ, OffsetInfoMap[CurPtr].Offset, + Changed, LoadI->getType()); if (auto *StoreI = dyn_cast<StoreInst>(Usr)) { if (StoreI->getValueOperand() == CurPtr) { LLVM_DEBUG(dbgs() << "[AAPointerInfo] Escaping use in store " @@ -1269,18 +1269,21 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { Optional<Value *> Content = A.getAssumedSimplified( *StoreI->getValueOperand(), *this, UsedAssumedInformation); return handleAccess(A, *StoreI, *CurPtr, Content, AccessKind::AK_WRITE, - PtrOI.Offset, Changed, + OffsetInfoMap[CurPtr].Offset, Changed, StoreI->getValueOperand()->getType()); } if (auto *CB = dyn_cast<CallBase>(Usr)) { if (CB->isLifetimeStartOrEnd()) return true; + if (TLI && isFreeCall(CB, TLI)) + return true; if (CB->isArgOperand(&U)) { unsigned ArgNo = CB->getArgOperandNo(&U); const auto &CSArgPI = A.getAAFor<AAPointerInfo>( *this, IRPosition::callsite_argument(*CB, ArgNo), DepClassTy::REQUIRED); - Changed = translateAndAddCalleeState(A, CSArgPI, PtrOI.Offset, *CB) | + Changed = translateAndAddCalleeState( + A, CSArgPI, OffsetInfoMap[CurPtr].Offset, *CB) | Changed; return true; } @@ -1293,8 +1296,15 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl { LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n"); return false; }; + auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { + if (OffsetInfoMap.count(NewU)) + return OffsetInfoMap[NewU] == OffsetInfoMap[OldU]; + OffsetInfoMap[NewU] = OffsetInfoMap[OldU]; + return true; + }; if (!A.checkForAllUses(UsePred, *this, AssociatedValue, - /* CheckBBLivenessOnly */ true)) + /* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL, + EquivalentUseCB)) return indicatePessimisticFixpoint(); LLVM_DEBUG({ @@ -2127,31 +2137,26 @@ static int64_t getKnownNonNullAndDerefBytesForUse( return DerefAA.getKnownDereferenceableBytes(); } + Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I); + if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile()) + return 0; + int64_t Offset; const Value *Base = - getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL); - if (Base) { - if (Base == &AssociatedValue && - getPointerOperand(I, /* AllowVolatile */ false) == UseV) { - int64_t DerefBytes = - (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset; - - IsNonNull |= !NullPointerIsDefined; - return std::max(int64_t(0), DerefBytes); - } + getMinimalBaseOfPointer(A, QueryingAA, Loc->Ptr, Offset, DL); + if (Base && Base == &AssociatedValue) { + int64_t DerefBytes = Loc->Size.getValue() + Offset; + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); } /// Corner case when an offset is 0. 
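The repeated "Note the order here" comments and the switch from a cached PtrOI reference to fresh OffsetInfoMap[CurPtr] lookups guard against a classic DenseMap pitfall: inserting a new key can rehash the table and invalidate references obtained earlier. A minimal illustration (editor's sketch):

#include "llvm/ADT/DenseMap.h"

static void danglingReferenceHazard() {
  llvm::DenseMap<int, int> M;
  M[1] = 10;
  int &A = M[1]; // reference into the map's bucket storage
  M[2] = 20;     // insertion may rehash; A can now dangle
  (void)A;       // using A after a rehash would be undefined behavior
}
// Hence the pattern above: touch OffsetInfoMap[Usr] (which may grow the
// map) before taking a reference to OffsetInfoMap[CurPtr].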
- Base = getBasePointerOfAccessPointerOperand(I, Offset, DL, - /*AllowNonInbounds*/ true); - if (Base) { - if (Offset == 0 && Base == &AssociatedValue && - getPointerOperand(I, /* AllowVolatile */ false) == UseV) { - int64_t DerefBytes = - (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); - IsNonNull |= !NullPointerIsDefined; - return std::max(int64_t(0), DerefBytes); - } + Base = GetPointerBaseWithConstantOffset(Loc->Ptr, Offset, DL, + /*AllowNonInbounds*/ true); + if (Base && Base == &AssociatedValue && Offset == 0) { + int64_t DerefBytes = Loc->Size.getValue(); + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); } return 0; @@ -2325,6 +2330,8 @@ struct AANoRecurseFunction final : AANoRecurseImpl { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { AANoRecurseImpl::initialize(A); + // TODO: We should build a call graph ourselves to enable this in the module + // pass as well. if (const Function *F = getAnchorScope()) if (A.getInfoCache().getSccSize(*F) != 1) indicatePessimisticFixpoint(); @@ -4057,17 +4064,15 @@ struct AADereferenceableImpl : AADereferenceable { if (!UseV->getType()->isPointerTy()) return; - Type *PtrTy = UseV->getType(); - const DataLayout &DL = A.getDataLayout(); + Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I); + if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile()) + return; + int64_t Offset; - if (const Value *Base = getBasePointerOfAccessPointerOperand( - I, Offset, DL, /*AllowNonInbounds*/ true)) { - if (Base == &getAssociatedValue() && - getPointerOperand(I, /* AllowVolatile */ false) == UseV) { - uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType()); - State.addAccessedBytes(Offset, Size); - } - } + const Value *Base = GetPointerBaseWithConstantOffset( + Loc->Ptr, Offset, A.getDataLayout(), /*AllowNonInbounds*/ true); + if (Base && Base == &getAssociatedValue()) + State.addAccessedBytes(Offset, Loc->Size.getValue()); } /// See followUsesInMBEC @@ -5236,6 +5241,8 @@ struct AAValueSimplifyImpl : AAValueSimplify { if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, AA, &L)) return false; + const auto *TLI = + A.getInfoCache().getTargetLibraryInfoForFunction(*L.getFunction()); for (Value *Obj : Objects) { LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n"); if (isa<UndefValue>(Obj)) @@ -5250,9 +5257,7 @@ struct AAValueSimplifyImpl : AAValueSimplify { continue; return false; } - if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj)) - return false; - Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType()); + Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType(), TLI); if (!InitialVal || !Union(*InitialVal)) return false; @@ -5745,13 +5750,6 @@ struct AAHeapToStackFunction final : public AAHeapToStack { /// The call that allocates the memory. CallBase *const CB; - /// The kind of allocation. - const enum class AllocationKind { - MALLOC, - CALLOC, - ALIGNED_ALLOC, - } Kind; - /// The library function id for the allocation. 
LibFunc LibraryFunctionId = NotLibFunc; @@ -5808,20 +5806,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack { DeallocationInfos[CB] = new (A.Allocator) DeallocationInfo{CB}; return true; } - bool IsMalloc = isMallocLikeFn(CB, TLI); - bool IsAlignedAllocLike = !IsMalloc && isAlignedAllocLikeFn(CB, TLI); - bool IsCalloc = - !IsMalloc && !IsAlignedAllocLike && isCallocLikeFn(CB, TLI); - if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) - return true; - auto Kind = - IsMalloc ? AllocationInfo::AllocationKind::MALLOC - : (IsCalloc ? AllocationInfo::AllocationKind::CALLOC - : AllocationInfo::AllocationKind::ALIGNED_ALLOC); - - AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB, Kind}; - AllocationInfos[CB] = AI; - TLI->getLibFunc(*CB, AI->LibraryFunctionId); + // To do heap to stack, we need to know that the allocation itself is + // removable once uses are rewritten, and that we can initialize the + // alloca to the same pattern as the original allocation result. + if (isAllocationFn(CB, TLI) && isAllocRemovable(CB, TLI)) { + auto *I8Ty = Type::getInt8Ty(CB->getParent()->getContext()); + if (nullptr != getInitialValueOfAllocation(CB, TLI, I8Ty)) { + AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB}; + AllocationInfos[CB] = AI; + TLI->getLibFunc(*CB, AI->LibraryFunctionId); + } + } return true; }; @@ -5917,21 +5912,22 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Optional<APInt> SizeAPI = getSize(A, *this, AI); if (SizeAPI.hasValue()) { Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI); - } else if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) { - auto *Num = AI.CB->getOperand(0); - auto *SizeT = AI.CB->getOperand(1); - IRBuilder<> B(AI.CB); - Size = B.CreateMul(Num, SizeT, "h2s.calloc.size"); - } else if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) { - Size = AI.CB->getOperand(1); } else { - Size = AI.CB->getOperand(0); + LLVMContext &Ctx = AI.CB->getContext(); + auto &DL = A.getInfoCache().getDL(); + ObjectSizeOpts Opts; + ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts); + SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB); + assert(SizeOffsetPair != ObjectSizeOffsetEvaluator::unknown() && + cast<ConstantInt>(SizeOffsetPair.second)->isZero()); + Size = SizeOffsetPair.first; } Align Alignment(1); - if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) { - Optional<APInt> AlignmentAPI = - getAPInt(A, *this, *AI.CB->getArgOperand(0)); + if (MaybeAlign RetAlign = AI.CB->getRetAlign()) + Alignment = max(Alignment, RetAlign); + if (Value *Align = getAllocAlignment(AI.CB, TLI)) { + Optional<APInt> AlignmentAPI = getAPInt(A, *this, *Align); assert(AlignmentAPI.hasValue() && "Expected an alignment during manifest!"); Alignment = @@ -5947,6 +5943,11 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc", Alloca->getNextNode()); + auto *I8Ty = Type::getInt8Ty(F->getContext()); + auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty); + assert(InitVal && + "Must be able to materialize initial memory state of allocation"); + A.changeValueAfterManifest(*AI.CB, *Alloca); if (auto *II = dyn_cast<InvokeInst>(AI.CB)) { @@ -5957,18 +5958,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack { A.deleteAfterManifest(*AI.CB); } - // Zero out the allocated memory if it was a calloc. 
- if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) { - auto *BI = new BitCastInst(Alloca, AI.CB->getType(), "calloc_bc", - Alloca->getNextNode()); - Value *Ops[] = { - BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size, - ConstantInt::get(Type::getInt1Ty(F->getContext()), false)}; - - Type *Tys[] = {BI->getType(), AI.CB->getOperand(0)->getType()}; - Module *M = F->getParent(); - Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); - CallInst::Create(Fn, Ops, "", BI->getNextNode()); + // Initialize the alloca with the same value as used by the allocation + // function. We can skip undef as the initial value of an alloc is + // undef, and the memset would simply end up being DSEd. + if (!isa<UndefValue>(InitVal)) { + IRBuilder<> Builder(Alloca->getNextNode()); + // TODO: Use alignment above if align!=1 + Builder.CreateMemSet(Alloca, InitVal, Size, None); } HasChanged = ChangeStatus::CHANGED; } @@ -5990,25 +5986,18 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Optional<APInt> getSize(Attributor &A, const AbstractAttribute &AA, AllocationInfo &AI) { + auto Mapper = [&](const Value *V) -> const Value * { + bool UsedAssumedInformation = false; + if (Optional<Constant *> SimpleV = + A.getAssumedConstant(*V, AA, UsedAssumedInformation)) + if (*SimpleV) + return *SimpleV; + return V; + }; - if (AI.Kind == AllocationInfo::AllocationKind::MALLOC) - return getAPInt(A, AA, *AI.CB->getArgOperand(0)); - - if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) - // Only if the alignment is also constant we return a size. - return getAPInt(A, AA, *AI.CB->getArgOperand(0)).hasValue() - ? getAPInt(A, AA, *AI.CB->getArgOperand(1)) - : llvm::None; - - assert(AI.Kind == AllocationInfo::AllocationKind::CALLOC && - "Expected only callocs are left"); - Optional<APInt> Num = getAPInt(A, AA, *AI.CB->getArgOperand(0)); - Optional<APInt> Size = getAPInt(A, AA, *AI.CB->getArgOperand(1)); - if (!Num.hasValue() || !Size.hasValue()) - return llvm::None; - bool Overflow = false; - Size = Size.getValue().umul_ov(Num.getValue(), Overflow); - return Overflow ? llvm::None : Size; + const Function *F = getAnchorScope(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + return getAllocSize(AI.CB, TLI, Mapper); } /// Collection of all malloc-like calls in a function with associated @@ -6025,6 +6014,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { ChangeStatus Changed = ChangeStatus::UNCHANGED; const Function *F = getAnchorScope(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); const auto &LivenessAA = A.getAAFor<AAIsDead>(*this, IRPosition::function(*F), DepClassTy::NONE); @@ -6239,22 +6229,24 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { if (AI.Status == AllocationInfo::INVALID) continue; - if (MaxHeapToStackSize == -1) { - if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) - if (!getAPInt(A, *this, *AI.CB->getArgOperand(0)).hasValue()) { - LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB - << "\n"); - AI.Status = AllocationInfo::INVALID; - Changed = ChangeStatus::CHANGED; - continue; - } - } else { + if (Value *Align = getAllocAlignment(AI.CB, TLI)) { + if (!getAPInt(A, *this, *Align)) { + // Can't generate an alloca which respects the required alignment + // on the allocation. 
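The heap-to-stack changes in this and the surrounding hunks replace the hard-coded MALLOC/CALLOC/ALIGNED_ALLOC enum with the MemoryBuiltins helpers: isAllocationFn and isAllocRemovable gate the transform, getAllocSize and getAllocAlignment describe the allocation, and getInitialValueOfAllocation supplies the byte pattern used to seed the new alloca (undef for malloc-like calls, zero for calloc). A hedged sketch of the gating sequence, mirroring the diff's own calls (the helper name is illustrative):

#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Is CB a heap allocation we could rewrite into an alloca?
static bool isRewritableAlloc(CallBase *CB, const TargetLibraryInfo *TLI) {
  if (!isAllocationFn(CB, TLI) || !isAllocRemovable(CB, TLI))
    return false;
  // We must know the initial contents in order to seed the alloca
  // (the memset is skipped when the initial value is undef).
  Type *I8Ty = Type::getInt8Ty(CB->getContext());
  return getInitialValueOfAllocation(CB, TLI, I8Ty) != nullptr;
}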
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB + << "\n"); + AI.Status = AllocationInfo::INVALID; + Changed = ChangeStatus::CHANGED; + continue; + } + } + + if (MaxHeapToStackSize != -1) { Optional<APInt> Size = getSize(A, *this, AI); if (!Size.hasValue() || Size.getValue().ugt(MaxHeapToStackSize)) { LLVM_DEBUG({ if (!Size.hasValue()) - dbgs() << "[H2S] Unknown allocation size (or alignment): " << *AI.CB - << "\n"; + dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n"; else dbgs() << "[H2S] Allocation size too large: " << *AI.CB << " vs. " << MaxHeapToStackSize << "\n"; @@ -6637,9 +6629,10 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { IRBuilder<NoFolder> IRB(IP); const DataLayout &DL = IP->getModule()->getDataLayout(); - if (Base->getType()->getPointerElementType() != PrivType) - Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(), - "", ACS.getInstruction()); + Type *PrivPtrType = PrivType->getPointerTo(); + if (Base->getType() != PrivPtrType) + Base = BitCastInst::CreateBitOrPointerCast(Base, PrivPtrType, "", + ACS.getInstruction()); // Traverse the type, build GEPs and loads. if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) { @@ -6781,7 +6774,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl { if (auto *AI = dyn_cast<AllocaInst>(Obj)) if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) if (CI->isOne()) - return Obj->getType()->getPointerElementType(); + return AI->getAllocatedType(); if (auto *Arg = dyn_cast<Argument>(Obj)) { auto &PrivArgAA = A.getAAFor<AAPrivatizablePtr>( *this, IRPosition::argument(*Arg), DepClassTy::REQUIRED); @@ -7675,7 +7668,6 @@ void AAMemoryLocationImpl::categorizePtrValue( for (Value *Obj : Objects) { // TODO: recognize the TBAA used for constant accesses. MemoryLocationsKind MLK = NO_LOCATIONS; - assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped."); if (isa<UndefValue>(Obj)) continue; if (isa<Argument>(Obj)) { @@ -8485,13 +8477,30 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { /* UseValueSimplify */ false)) return indicatePessimisticFixpoint(); - return clampStateAndIndicateChange(getState(), T); + // Ensure that long def-use chains can't cause circular reasoning either by + // introducing a cutoff below. + if (clampStateAndIndicateChange(getState(), T) == ChangeStatus::UNCHANGED) + return ChangeStatus::UNCHANGED; + if (++NumChanges > MaxNumChanges) { + LLVM_DEBUG(dbgs() << "[AAValueConstantRange] performed " << NumChanges + << " but only " << MaxNumChanges + << " are allowed to avoid cyclic reasoning."); + return indicatePessimisticFixpoint(); + } + return ChangeStatus::CHANGED; } /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(value_range) } + + /// Tracker to bail after too many widening steps of the constant range. + int NumChanges = 0; + + /// Upper bound for the number of allowed changes (=widening steps) for the + /// constant range before we give up. 
+ static constexpr int MaxNumChanges = 5; }; struct AAValueConstantRangeFunction : AAValueConstantRangeImpl { diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index fb9ab7954e36..2a6e38b0437f 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -287,7 +287,8 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) { SmallVector<unsigned, 8> UnusedArgs; bool Changed = false; - AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes(); + AttributeMask UBImplyingAttributes = + AttributeFuncs::getUBImplyingAttributes(); for (Argument &Arg : Fn.args()) { if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasPassPointeeByValueCopyAttr()) { @@ -838,7 +839,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttrBuilder RAttrs(PAL.getRetAttrs()); + AttrBuilder RAttrs(F->getContext(), PAL.getRetAttrs()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes @@ -889,7 +890,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. - AttrBuilder RAttrs(CallPAL.getRetAttrs()); + AttrBuilder RAttrs(F->getContext(), CallPAL.getRetAttrs()); RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -912,7 +913,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // this is not an expected case anyway ArgAttrVec.push_back(AttributeSet::get( F->getContext(), - AttrBuilder(Attrs).removeAttribute(Attribute::Returned))); + AttrBuilder(F->getContext(), Attrs).removeAttribute(Attribute::Returned))); } else { // Otherwise, use the original attributes. ArgAttrVec.push_back(Attrs); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 321d4a19a585..213a998d5bba 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -133,7 +133,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, if (AliasAnalysis::onlyReadsMemory(MRB)) return MAK_ReadOnly; - if (AliasAnalysis::doesNotReadMemory(MRB)) + if (AliasAnalysis::onlyWritesMemory(MRB)) return MAK_WriteOnly; // Conservatively assume it reads and writes to memory. @@ -295,13 +295,13 @@ static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, // No change. continue; - if (F->doesNotReadMemory() && WritesMemory) + if (F->onlyWritesMemory() && WritesMemory) continue; Changed.insert(F); // Clear out any existing attributes. - AttrBuilder AttrsToRemove; + AttributeMask AttrsToRemove; AttrsToRemove.addAttribute(Attribute::ReadOnly); AttrsToRemove.addAttribute(Attribute::ReadNone); AttrsToRemove.addAttribute(Attribute::WriteOnly); @@ -720,10 +720,16 @@ determinePointerAccessAttrs(Argument *A, // The accessors used on call site here do the right thing for calls and // invokes with operand bundles. 
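Several hunks above swap AttrBuilder for the new AttributeMask type wherever the goal is removing attributes rather than adding them, so the builder (and its now-required context) is reserved for construction. A hedged sketch of the removal pattern used by addReadAttrs (illustrative helper):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void clearMemoryAccessAttrs(Function &F) {
  AttributeMask ToRemove; // a removal set needs no LLVMContext
  ToRemove.addAttribute(Attribute::ReadOnly);
  ToRemove.addAttribute(Attribute::ReadNone);
  ToRemove.addAttribute(Attribute::WriteOnly);
  F.removeFnAttrs(ToRemove); // clear before re-adding the computed one
}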
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) - return Attribute::None; - if (!CB.doesNotAccessMemory(UseIndex)) + if (CB.doesNotAccessMemory(UseIndex)) { + /* nop */ + } else if (CB.onlyReadsMemory() || CB.onlyReadsMemory(UseIndex)) { IsRead = true; + } else if (CB.hasFnAttr(Attribute::WriteOnly) || + CB.dataOperandHasImpliedAttr(UseIndex, Attribute::WriteOnly)) { + IsWrite = true; + } else { + return Attribute::None; + } break; } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 2425646455bd..6c3cc3914337 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -6,15 +6,24 @@ // //===----------------------------------------------------------------------===// // -// This specialises functions with constant parameters (e.g. functions, -// globals). Constant parameters like function pointers and constant globals -// are propagated to the callee by specializing the function. +// This specialises functions with constant parameters. Constant parameters +// like function pointers and constant globals are propagated to the callee by +// specializing the function. The main benefit of this pass at the moment is +// that indirect calls are transformed into direct calls, which provides inline +// opportunities that the inliner would not have been able to achieve. That's +// why function specialisation is run before the inliner in the optimisation +// pipeline; that is by design. Otherwise, we would only benefit from constant +// passing, which is a valid use-case too, but hasn't been explored much in +// terms of performance uplifts, cost-model and compile-time impact. // // Current limitations: -// - It does not yet handle integer ranges. +// - It does not yet handle integer ranges. We do support "literal constants", +// but that's off by default under an option. // - Only 1 argument per function is specialised, -// - The cost-model could be further looked into, -// - We are not yet caching analysis results. +// - The cost-model could be further looked into (it mainly focuses on inlining +// benefits), +// - We are not yet caching analysis results, but profiling and checking where +// extra compile time is spent didn't suggest this to be a problem. // // Ideas: // - With a function specialization attribute for arguments, we could have @@ -30,8 +39,12 @@ // https://reviews.llvm.org/D106426 for details. Perhaps there is a // compile-time friendlier way to control/limit the number of specialisations // for recursive functions. -// - Don't transform the function if there is no function specialization -// happens. +// - Don't transform the function if function specialization does not trigger; +// the SCCPSolver may make IR changes. 
+// +// References: +// - 2021 LLVM Dev Mtg “Introducing function specialisation, and can we enable +// it by default?”, https://www.youtube.com/watch?v=zJiCjeXgV5Q // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b1f3ff15c97b..d3cac3efce86 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -303,11 +303,11 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, else if (auto *GEP = dyn_cast<GEPOperator>(U)) append_range(WorkList, GEP->users()); else if (auto *LI = dyn_cast<LoadInst>(U)) { - // A load from zeroinitializer is always zeroinitializer, regardless of - // any applied offset. + // A load from a uniform value is always the same, regardless of any + // applied offset. Type *Ty = LI->getType(); - if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) { - LI->replaceAllUsesWith(Constant::getNullValue(Ty)); + if (Constant *Res = ConstantFoldLoadFromUniformValue(Init, Ty)) { + LI->replaceAllUsesWith(Res); EraseFromParent(LI); continue; } @@ -337,107 +337,68 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, return Changed; } -static bool isSafeSROAElementUse(Value *V); - -/// Return true if the specified GEP is a safe user of a derived -/// expression from a global that we want to SROA. -static bool isSafeSROAGEP(User *U) { - // Check to see if this ConstantExpr GEP is SRA'able. In particular, we - // don't like < 3 operand CE's, and we don't like non-constant integer - // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some - // value of C. - if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) || - !cast<Constant>(U->getOperand(1))->isNullValue()) - return false; - - gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U); - ++GEPI; // Skip over the pointer index. - - // For all other level we require that the indices are constant and inrange. - // In particular, consider: A[0][i]. We cannot know that the user isn't doing - // invalid things like allowing i to index an out-of-range subscript that - // accesses A[1]. This can also happen between different members of a struct - // in llvm IR. - for (; GEPI != E; ++GEPI) { - if (GEPI.isStruct()) +/// Look at all uses of the global and determine which (offset, type) pairs it +/// can be split into. +static bool collectSRATypes(DenseMap<uint64_t, Type *> &Types, GlobalValue *GV, + const DataLayout &DL) { + SmallVector<Use *, 16> Worklist; + SmallPtrSet<Use *, 16> Visited; + auto AppendUses = [&](Value *V) { + for (Use &U : V->uses()) + if (Visited.insert(&U).second) + Worklist.push_back(&U); + }; + AppendUses(GV); + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + User *V = U->getUser(); + if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V)) { + AppendUses(V); continue; + } - ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand()); - if (!IdxVal || (GEPI.isBoundedSequential() && - IdxVal->getZExtValue() >= GEPI.getSequentialNumElements())) - return false; - } - - return llvm::all_of(U->users(), isSafeSROAElementUse); -} - -/// Return true if the specified instruction is a safe user of a derived -/// expression from a global that we want to SROA. -static bool isSafeSROAElementUse(Value *V) { - // We might have a dead and dangling constant hanging off of here. 
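Editor's note on the CleanupConstantGlobalUsers hunk near the top of this file: ConstantFoldLoadFromUniformValue generalizes the old zeroinitializer special case to any constant whose bytes are all known to be identical, so the load folds regardless of offset or type. A hedged usage sketch:

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
using namespace llvm;

// With Init = zeroinitializer and Ty = float this yields 0.0f; the
// load offset is irrelevant because every byte of Init is identical.
static Constant *foldUniformLoad(Constant *Init, Type *Ty) {
  return ConstantFoldLoadFromUniformValue(Init, Ty);
}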
- if (Constant *C = dyn_cast<Constant>(V)) - return isSafeToDestroyConstant(C); - - Instruction *I = dyn_cast<Instruction>(V); - if (!I) return false; + if (auto *GEP = dyn_cast<GEPOperator>(V)) { + if (!GEP->hasAllConstantIndices()) + return false; + AppendUses(V); + continue; + } - // Loads are ok. - if (isa<LoadInst>(I)) return true; + if (Value *Ptr = getLoadStorePointerOperand(V)) { + // This is storing the global address into somewhere, not storing into + // the global. + if (isa<StoreInst>(V) && U->getOperandNo() == 0) + return false; - // Stores *to* the pointer are ok. - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->getOperand(0) != V; + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + if (Ptr != GV || Offset.getActiveBits() >= 64) + return false; - // Otherwise, it must be a GEP. Check it and its users are safe to SRA. - return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I); -} + // TODO: We currently require that all accesses at a given offset must + // use the same type. This could be relaxed. + Type *Ty = getLoadStoreType(V); + auto It = Types.try_emplace(Offset.getZExtValue(), Ty).first; + if (Ty != It->second) + return false; + continue; + } -/// Look at all uses of the global and decide whether it is safe for us to -/// perform this transformation. -static bool GlobalUsersSafeToSRA(GlobalValue *GV) { - for (User *U : GV->users()) { - // The user of the global must be a GEP Inst or a ConstantExpr GEP. - if (!isa<GetElementPtrInst>(U) && - (!isa<ConstantExpr>(U) || - cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr)) - return false; + // Ignore dead constant users. + if (auto *C = dyn_cast<Constant>(V)) { + if (!isSafeToDestroyConstant(C)) + return false; + continue; + } - // Check the gep and it's users are safe to SRA - if (!isSafeSROAGEP(U)) - return false; + // Unknown user. + return false; } return true; } -static bool IsSRASequential(Type *T) { - return isa<ArrayType>(T) || isa<VectorType>(T); -} -static uint64_t GetSRASequentialNumElements(Type *T) { - if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getNumElements(); - return cast<FixedVectorType>(T)->getNumElements(); -} -static Type *GetSRASequentialElementType(Type *T) { - if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getElementType(); - return cast<VectorType>(T)->getElementType(); -} -static bool CanDoGlobalSRA(GlobalVariable *GV) { - Constant *Init = GV->getInitializer(); - - if (isa<StructType>(Init->getType())) { - // nothing to check - } else if (IsSRASequential(Init->getType())) { - if (GetSRASequentialNumElements(Init->getType()) > 16 && - GV->hasNUsesOrMore(16)) - return false; // It's not worth it. - } else - return false; - - return GlobalUsersSafeToSRA(GV); -} - /// Copy over the debug info for a variable to its SRA replacements. static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, uint64_t FragmentOffsetInBits, @@ -468,161 +429,140 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, /// transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { - // Make sure this global only has simple uses that we can SRA. - if (!CanDoGlobalSRA(GV)) + assert(GV->hasLocalLinkage()); + + // Collect types to split into. 
+ DenseMap<uint64_t, Type *> Types; + if (!collectSRATypes(Types, GV, DL) || Types.empty()) return nullptr; - assert(GV->hasLocalLinkage()); - Constant *Init = GV->getInitializer(); - Type *Ty = Init->getType(); - uint64_t VarSize = DL.getTypeSizeInBits(Ty); + // Make sure we don't SRA back to the same type. + if (Types.size() == 1 && Types.begin()->second == GV->getValueType()) + return nullptr; - std::map<unsigned, GlobalVariable *> NewGlobals; + // Don't perform SRA if we would have to split into many globals. + if (Types.size() > 16) + return nullptr; - // Get the alignment of the global, either explicit or target-specific. - Align StartAlignment = - DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType()); - - // Loop over all users and create replacement variables for used aggregate - // elements. - for (User *GEP : GV->users()) { - assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() == - Instruction::GetElementPtr) || - isa<GetElementPtrInst>(GEP)) && - "NonGEP CE's are not SRAable!"); - - // Ignore the 1th operand, which has to be zero or else the program is quite - // broken (undefined). Get the 2nd operand, which is the structure or array - // index. - unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - if (NewGlobals.count(ElementIdx) == 1) - continue; // we`ve already created replacement variable - assert(NewGlobals.count(ElementIdx) == 0); - - Type *ElTy = nullptr; - if (StructType *STy = dyn_cast<StructType>(Ty)) - ElTy = STy->getElementType(ElementIdx); - else - ElTy = GetSRASequentialElementType(Ty); - assert(ElTy); + // Sort by offset. + SmallVector<std::pair<uint64_t, Type *>, 16> TypesVector; + append_range(TypesVector, Types); + sort(TypesVector, + [](const auto &A, const auto &B) { return A.first < B.first; }); - Constant *In = Init->getAggregateElement(ElementIdx); - assert(In && "Couldn't get element of initializer?"); + // Check that the types are non-overlapping. + uint64_t Offset = 0; + for (const auto &Pair : TypesVector) { + // Overlaps with previous type. + if (Pair.first < Offset) + return nullptr; - GlobalVariable *NGV = new GlobalVariable( - ElTy, false, GlobalVariable::InternalLinkage, In, - GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - NewGlobals.insert(std::make_pair(ElementIdx, NGV)); - - if (StructType *STy = dyn_cast<StructType>(Ty)) { - const StructLayout &Layout = *DL.getStructLayout(STy); - - // Calculate the known alignment of the field. If the original aggregate - // had 256 byte alignment for example, something might depend on that: - // propagate info to each field. - uint64_t FieldOffset = Layout.getElementOffset(ElementIdx); - Align NewAlign = commonAlignment(StartAlignment, FieldOffset); - if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx))) - NGV->setAlignment(NewAlign); - - // Copy over the debug info for the variable. - uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType()); - uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx); - transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize); - } else { - uint64_t EltSize = DL.getTypeAllocSize(ElTy); - Align EltAlign = DL.getABITypeAlign(ElTy); - uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); - - // Calculate the known alignment of the field. 
If the original aggregate - // had 256 byte alignment for example, something might depend on that: - // propagate info to each field. - Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx); - if (NewAlign > EltAlign) - NGV->setAlignment(NewAlign); - transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx, - FragmentSizeInBits, VarSize); - } + Offset = Pair.first + DL.getTypeAllocSize(Pair.second); } - if (NewGlobals.empty()) + // Some accesses go beyond the end of the global, don't bother. + if (Offset > DL.getTypeAllocSize(GV->getValueType())) return nullptr; - Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); - for (auto NewGlobalVar : NewGlobals) - Globals.push_back(NewGlobalVar.second); + // Collect initializers for new globals. + Constant *OrigInit = GV->getInitializer(); + DenseMap<uint64_t, Constant *> Initializers; + for (const auto &Pair : Types) { + Constant *NewInit = ConstantFoldLoadFromConst(OrigInit, Pair.second, + APInt(64, Pair.first), DL); + if (!NewInit) { + LLVM_DEBUG(dbgs() << "Global SRA: Failed to evaluate initializer of " + << *GV << " with type " << *Pair.second << " at offset " + << Pair.first << "\n"); + return nullptr; + } + Initializers.insert({Pair.first, NewInit}); + } LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); - Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); + // Get the alignment of the global, either explicit or target-specific. + Align StartAlignment = + DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); + uint64_t VarSize = DL.getTypeSizeInBits(GV->getValueType()); + + // Create replacement globals. + DenseMap<uint64_t, GlobalVariable *> NewGlobals; + unsigned NameSuffix = 0; + for (auto &Pair : TypesVector) { + uint64_t Offset = Pair.first; + Type *Ty = Pair.second; + GlobalVariable *NGV = new GlobalVariable( + *GV->getParent(), Ty, false, GlobalVariable::InternalLinkage, + Initializers[Offset], GV->getName() + "." + Twine(NameSuffix++), GV, + GV->getThreadLocalMode(), GV->getAddressSpace()); + NGV->copyAttributesFrom(GV); + NewGlobals.insert({Offset, NGV}); + + // Calculate the known alignment of the field. If the original aggregate + // had 256 byte alignment for example, something might depend on that: + // propagate info to each field. + Align NewAlign = commonAlignment(StartAlignment, Offset); + if (NewAlign > DL.getABITypeAlign(Ty)) + NGV->setAlignment(NewAlign); + + // Copy over the debug info for the variable. + transferSRADebugInfo(GV, NGV, Offset * 8, DL.getTypeAllocSizeInBits(Ty), + VarSize); + } + + // Replace uses of the original global with uses of the new global. + SmallVector<Value *, 16> Worklist; + SmallPtrSet<Value *, 16> Visited; + SmallVector<WeakTrackingVH, 16> DeadInsts; + auto AppendUsers = [&](Value *V) { + for (User *U : V->users()) + if (Visited.insert(U).second) + Worklist.push_back(U); + }; + AppendUsers(GV); + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V) || + isa<GEPOperator>(V)) { + AppendUsers(V); + if (isa<Instruction>(V)) + DeadInsts.push_back(V); + continue; + } - // Loop over all of the uses of the global, replacing the constantexpr geps, - // with smaller constantexpr geps or direct references. 
- while (!GV->use_empty()) { - User *GEP = GV->user_back(); - assert(((isa<ConstantExpr>(GEP) && - cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)|| - isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!"); - - // Ignore the 1th operand, which has to be zero or else the program is quite - // broken (undefined). Get the 2nd operand, which is the structure or array - // index. - unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - assert(NewGlobals.count(ElementIdx) == 1); - - Value *NewPtr = NewGlobals[ElementIdx]; - Type *NewTy = NewGlobals[ElementIdx]->getValueType(); - - // Form a shorter GEP if needed. - if (GEP->getNumOperands() > 3) { - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) { - SmallVector<Constant*, 8> Idxs; - Idxs.push_back(NullInt); - for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i) - Idxs.push_back(CE->getOperand(i)); - NewPtr = - ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs); + if (Value *Ptr = getLoadStorePointerOperand(V)) { + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset, + /* AllowNonInbounds */ true); + assert(Ptr == GV && "Load/store must be from/to global"); + GlobalVariable *NGV = NewGlobals[Offset.getZExtValue()]; + assert(NGV && "Must have replacement global for this offset"); + + // Update the pointer operand and recalculate alignment. + Align PrefAlign = DL.getPrefTypeAlign(getLoadStoreType(V)); + Align NewAlign = + getOrEnforceKnownAlignment(NGV, PrefAlign, DL, cast<Instruction>(V)); + + if (auto *LI = dyn_cast<LoadInst>(V)) { + LI->setOperand(0, NGV); + LI->setAlignment(NewAlign); } else { - GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP); - SmallVector<Value*, 8> Idxs; - Idxs.push_back(NullInt); - for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) - Idxs.push_back(GEPI->getOperand(i)); - NewPtr = GetElementPtrInst::Create( - NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx), - GEPI); - } - } - GEP->replaceAllUsesWith(NewPtr); - - // We changed the pointer of any memory access user. Recalculate alignments. - for (User *U : NewPtr->users()) { - if (auto *Load = dyn_cast<LoadInst>(U)) { - Align PrefAlign = DL.getPrefTypeAlign(Load->getType()); - Align NewAlign = getOrEnforceKnownAlignment(Load->getPointerOperand(), - PrefAlign, DL, Load); - Load->setAlignment(NewAlign); - } - if (auto *Store = dyn_cast<StoreInst>(U)) { - Align PrefAlign = - DL.getPrefTypeAlign(Store->getValueOperand()->getType()); - Align NewAlign = getOrEnforceKnownAlignment(Store->getPointerOperand(), - PrefAlign, DL, Store); - Store->setAlignment(NewAlign); + auto *SI = cast<StoreInst>(V); + SI->setOperand(1, NGV); + SI->setAlignment(NewAlign); } + continue; } - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP)) - GEPI->eraseFromParent(); - else - cast<ConstantExpr>(GEP)->destroyConstant(); + assert(isa<Constant>(V) && isSafeToDestroyConstant(cast<Constant>(V)) && + "Other users can only be dead constants"); } - // Delete the old global, now that it is dead. - Globals.erase(GV); + // Delete old instructions and global. 
+  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+  GV->removeDeadConstantUsers();
+  GV->eraseFromParent();
   ++NumSRA;
 
   assert(NewGlobals.size() > 0);
@@ -677,7 +617,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
              "Should be GlobalVariable");
       // This and only this kind of non-signed ICmpInst is to be replaced with
       // the comparing of the value of the created global init bool later in
-      // optimizeGlobalAddressOfMalloc for the global variable.
+      // optimizeGlobalAddressOfAllocation for the global variable.
     } else {
       //cerr << "NONTRAPPING USE: " << *U;
       return false;
@@ -895,29 +835,36 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
 /// to actually DO the malloc.  Instead, turn the malloc into a global, and any
 /// loads of GV as uses of the new global.
 static GlobalVariable *
-OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
-                              ConstantInt *NElements, const DataLayout &DL,
-                              TargetLibraryInfo *TLI) {
+OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI,
+                                  uint64_t AllocSize, Constant *InitVal,
+                                  const DataLayout &DL,
+                                  TargetLibraryInfo *TLI) {
   LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << "  CALL = " << *CI
                     << '\n');
 
-  Type *GlobalType;
-  if (NElements->getZExtValue() == 1)
-    GlobalType = AllocTy;
-  else
-    // If we have an array allocation, the global variable is of an array.
-    GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+  // Create global of type [AllocSize x i8].
+  Type *GlobalType = ArrayType::get(Type::getInt8Ty(GV->getContext()),
+                                    AllocSize);
 
-  // Create the new global variable.  The contents of the malloc'd memory is
-  // undefined, so initialize with an undef value.
+  // Create the new global variable.  The contents of the allocated memory is
+  // undefined initially, so initialize with an undef value.
   GlobalVariable *NewGV = new GlobalVariable(
       *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
       UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
       GV->getThreadLocalMode());
 
-  // If there are bitcast users of the malloc (which is typical, usually we have
-  // a malloc + bitcast) then replace them with uses of the new global. Update
-  // other users to use the global as well.
+  // Initialize the global at the point of the original call.  Note that this
+  // is a different point from the initialization referred to below for the
+  // nullability handling.  Subtlety: We have not proven the original global
+  // was only initialized once.  As such, we can not fold this into the
+  // initializer of the new global, as we may need to re-init the storage
+  // multiple times.
+  if (!isa<UndefValue>(InitVal)) {
+    IRBuilder<> Builder(CI->getNextNode());
+    // TODO: Use alignment above if align!=1
+    Builder.CreateMemSet(NewGV, InitVal, AllocSize, None);
+  }
+
+  // Update users of the allocation to use the new global instead.
   BitCastInst *TheBC = nullptr;
   while (!CI->use_empty()) {
     Instruction *User = cast<Instruction>(CI->user_back());
@@ -1009,7 +956,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
   } else
     GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
 
-  // Now the GV is dead, nuke it and the malloc..
+  // Now the GV is dead, nuke it and the allocation..
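  // Illustrative sketch of the overall rewrite (assumed names, not part of
  // this patch):
  //
  //   @g = internal global i8* null                ; before
  //   %p = call i8* @malloc(i64 8)
  //   store i8* %p, i8** @g
  //
  //   @g.body = internal global [8 x i8] undef     ; after
  //
  // Loads of @g become uses of @g.body; if null comparisons of @g remain, an
  // i1 "init" boolean global is created to stand in for "has been allocated".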
GV->eraseFromParent(); CI->eraseFromParent(); @@ -1066,15 +1013,33 @@ valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI, return true; } -/// This function is called when we see a pointer global variable with a single -/// value stored it that is a malloc or cast of malloc. -static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, - Type *AllocTy, - AtomicOrdering Ordering, - const DataLayout &DL, - TargetLibraryInfo *TLI) { - // If this is a malloc of an abstract type, don't touch it. - if (!AllocTy->isSized()) +/// If we have a global that is only initialized with a fixed size allocation +/// try to transform the program to use global memory instead of heap +/// allocated memory. This eliminates dynamic allocation, avoids an indirection +/// accessing the data, and exposes the resultant global to further GlobalOpt. +static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV, + CallInst *CI, + AtomicOrdering Ordering, + const DataLayout &DL, + TargetLibraryInfo *TLI) { + if (!isAllocRemovable(CI, TLI)) + // Must be able to remove the call when we get done.. + return false; + + Type *Int8Ty = Type::getInt8Ty(CI->getFunction()->getContext()); + Constant *InitVal = getInitialValueOfAllocation(CI, TLI, Int8Ty); + if (!InitVal) + // Must be able to emit a memset for initialization + return false; + + uint64_t AllocSize; + if (!getObjectSize(CI, AllocSize, DL, TLI, ObjectSizeOpts())) + return false; + + // Restrict this transformation to only working on small allocations + // (2048 bytes currently), as we don't want to introduce a 16M global or + // something. + if (AllocSize >= 2048) return false; // We can't optimize this global unless all uses of it are *known* to be @@ -1093,25 +1058,8 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV)) return false; - // If we have a global that is only initialized with a fixed size malloc, - // transform the program to use global memory instead of malloc'd memory. - // This eliminates dynamic allocation, avoids an indirection accessing the - // data, and exposes the resultant global to further GlobalOpt. - // We cannot optimize the malloc if we cannot determine malloc array size. - Value *NElems = getMallocArraySize(CI, DL, TLI, true); - if (!NElems) - return false; - - if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) - // Restrict this transformation to only working on small allocations - // (2048 bytes currently), as we don't want to introduce a 16M global or - // something. - if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { - OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); - return true; - } - - return false; + OptimizeGlobalAddressOfAllocation(GV, CI, AllocSize, InitVal, DL, TLI); + return true; } // Try to optimize globals based on the knowledge that only one value (besides @@ -1140,12 +1088,12 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, // Optimize away any trapping uses of the loaded value. 
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI)) return true; - } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) { - auto *TLI = &GetTLI(*CI->getFunction()); - Type *MallocType = getMallocAllocatedType(CI, TLI); - if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, - Ordering, DL, TLI)) - return true; + } else if (isAllocationFn(StoredOnceVal, GetTLI)) { + if (auto *CI = dyn_cast<CallInst>(StoredOnceVal)) { + auto *TLI = &GetTLI(*CI->getFunction()); + if (tryToOptimizeStoreOfAllocationToGlobal(GV, CI, Ordering, DL, TLI)) + return true; + } } } @@ -1171,9 +1119,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // Walk the use list of the global seeing if all the uses are load or store. // If there is anything else, bail out. - for (User *U : GV->users()) + for (User *U : GV->users()) { if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; + if (getLoadStoreType(U) != GVElType) + return false; + } LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n"); @@ -1590,11 +1541,25 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // This is restricted to address spaces that allow globals to have // initializers. NVPTX, for example, does not support initializers for // shared memory (AS 3). - if (SOVConstant && SOVConstant->getType() == GV->getValueType() && - isa<UndefValue>(GV->getInitializer()) && + if (SOVConstant && isa<UndefValue>(GV->getInitializer()) && + DL.getTypeAllocSize(SOVConstant->getType()) == + DL.getTypeAllocSize(GV->getValueType()) && CanHaveNonUndefGlobalInitializer) { - // Change the initial value here. - GV->setInitializer(SOVConstant); + if (SOVConstant->getType() == GV->getValueType()) { + // Change the initializer in place. + GV->setInitializer(SOVConstant); + } else { + // Create a new global with adjusted type. + auto *NGV = new GlobalVariable( + *GV->getParent(), SOVConstant->getType(), GV->isConstant(), + GV->getLinkage(), SOVConstant, "", GV, GV->getThreadLocalMode(), + GV->getAddressSpace()); + NGV->takeName(GV); + NGV->copyAttributesFrom(GV); + GV->replaceAllUsesWith(ConstantExpr::getBitCast(NGV, GV->getType())); + GV->eraseFromParent(); + GV = NGV; + } // Clean up any obviously simplifiable users now. CleanupConstantGlobalUsers(GV, DL); @@ -2066,194 +2031,6 @@ OptimizeGlobalVars(Module &M, return Changed; } -/// Evaluate a piece of a constantexpr store into a global initializer. This -/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the -/// GEP operands of Addr [0, OpNo) have been stepped into. -static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, - ConstantExpr *Addr, unsigned OpNo) { - // Base case of the recursion. - if (OpNo == Addr->getNumOperands()) { - assert(Val->getType() == Init->getType() && "Type mismatch!"); - return Val; - } - - SmallVector<Constant*, 32> Elts; - if (StructType *STy = dyn_cast<StructType>(Init->getType())) { - // Break up the constant into its elements. - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) - Elts.push_back(Init->getAggregateElement(i)); - - // Replace the element that we are supposed to. - ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo)); - unsigned Idx = CU->getZExtValue(); - assert(Idx < STy->getNumElements() && "Struct index out of range!"); - Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1); - - // Return the modified struct. 
- return ConstantStruct::get(STy, Elts); - } - - ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo)); - uint64_t NumElts; - if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType())) - NumElts = ATy->getNumElements(); - else - NumElts = cast<FixedVectorType>(Init->getType())->getNumElements(); - - // Break up the array into elements. - for (uint64_t i = 0, e = NumElts; i != e; ++i) - Elts.push_back(Init->getAggregateElement(i)); - - assert(CI->getZExtValue() < NumElts); - Elts[CI->getZExtValue()] = - EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1); - - if (Init->getType()->isArrayTy()) - return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts); - return ConstantVector::get(Elts); -} - -/// We have decided that Addr (which satisfies the predicate -/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen. -static void CommitValueTo(Constant *Val, Constant *Addr) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { - assert(GV->hasInitializer()); - GV->setInitializer(Val); - return; - } - - ConstantExpr *CE = cast<ConstantExpr>(Addr); - GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); - GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2)); -} - -/// Given a map of address -> value, where addresses are expected to be some form -/// of either a global or a constant GEP, set the initializer for the address to -/// be the value. This performs mostly the same function as CommitValueTo() -/// and EvaluateStoreInto() but is optimized to be more efficient for the common -/// case where the set of addresses are GEPs sharing the same underlying global, -/// processing the GEPs in batches rather than individually. -/// -/// To give an example, consider the following C++ code adapted from the clang -/// regression tests: -/// struct S { -/// int n = 10; -/// int m = 2 * n; -/// S(int a) : n(a) {} -/// }; -/// -/// template<typename T> -/// struct U { -/// T *r = &q; -/// T q = 42; -/// U *p = this; -/// }; -/// -/// U<S> e; -/// -/// The global static constructor for 'e' will need to initialize 'r' and 'p' of -/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm' -/// members. This batch algorithm will simply use general CommitValueTo() method -/// to handle the complex nested S struct initialization of 'q', before -/// processing the outermost members in a single batch. Using CommitValueTo() to -/// handle member in the outer struct is inefficient when the struct/array is -/// very large as we end up creating and destroy constant arrays for each -/// initialization. -/// For the above case, we expect the following IR to be generated: -/// -/// %struct.U = type { %struct.S*, %struct.S, %struct.U* } -/// %struct.S = type { i32, i32 } -/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e, -/// i64 0, i32 1), -/// %struct.S { i32 42, i32 84 }, %struct.U* @e } -/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex -/// constant expression, while the other two elements of @e are "simple". 
-static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) { - SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs; - SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs; - SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs; - SimpleCEs.reserve(Mem.size()); - - for (const auto &I : Mem) { - if (auto *GV = dyn_cast<GlobalVariable>(I.first)) { - GVs.push_back(std::make_pair(GV, I.second)); - } else { - ConstantExpr *GEP = cast<ConstantExpr>(I.first); - // We don't handle the deeply recursive case using the batch method. - if (GEP->getNumOperands() > 3) - ComplexCEs.push_back(std::make_pair(GEP, I.second)); - else - SimpleCEs.push_back(std::make_pair(GEP, I.second)); - } - } - - // The algorithm below doesn't handle cases like nested structs, so use the - // slower fully general method if we have to. - for (auto ComplexCE : ComplexCEs) - CommitValueTo(ComplexCE.second, ComplexCE.first); - - for (auto GVPair : GVs) { - assert(GVPair.first->hasInitializer()); - GVPair.first->setInitializer(GVPair.second); - } - - if (SimpleCEs.empty()) - return; - - // We cache a single global's initializer elements in the case where the - // subsequent address/val pair uses the same one. This avoids throwing away and - // rebuilding the constant struct/vector/array just because one element is - // modified at a time. - SmallVector<Constant *, 32> Elts; - Elts.reserve(SimpleCEs.size()); - GlobalVariable *CurrentGV = nullptr; - - auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) { - Constant *Init = GV->getInitializer(); - Type *Ty = Init->getType(); - if (Update) { - if (CurrentGV) { - assert(CurrentGV && "Expected a GV to commit to!"); - Type *CurrentInitTy = CurrentGV->getInitializer()->getType(); - // We have a valid cache that needs to be committed. - if (StructType *STy = dyn_cast<StructType>(CurrentInitTy)) - CurrentGV->setInitializer(ConstantStruct::get(STy, Elts)); - else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy)) - CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts)); - else - CurrentGV->setInitializer(ConstantVector::get(Elts)); - } - if (CurrentGV == GV) - return; - // Need to clear and set up cache for new initializer. - CurrentGV = GV; - Elts.clear(); - unsigned NumElts; - if (auto *STy = dyn_cast<StructType>(Ty)) - NumElts = STy->getNumElements(); - else if (auto *ATy = dyn_cast<ArrayType>(Ty)) - NumElts = ATy->getNumElements(); - else - NumElts = cast<FixedVectorType>(Ty)->getNumElements(); - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(Init->getAggregateElement(i)); - } - }; - - for (auto CEPair : SimpleCEs) { - ConstantExpr *GEP = CEPair.first; - Constant *Val = CEPair.second; - - GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0)); - commitAndSetupCache(GV, GV != CurrentGV); - ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2)); - Elts[CI->getZExtValue()] = Val; - } - // The last initializer in the list needs to be committed, others - // will be committed on a new initializer being processed. - commitAndSetupCache(CurrentGV, true); -} - /// Evaluate static constructors in the function, if we can. Return true if we /// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, @@ -2268,10 +2045,12 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, ++NumCtorsEvaluated; // We succeeded at evaluation: commit the result. 
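  // Illustrative effect (assumed values, not part of this patch): if the ctor
  // stored 42 to @x = global i32 0, getMutatedInitializers() yields the map
  // {@x -> i32 42}, and the loop below bakes that constant into @x's
  // initializer in place of the old batch-commit machinery.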
+    auto NewInitializers = Eval.getMutatedInitializers();
     LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
-                      << F->getName() << "' to "
-                      << Eval.getMutatedMemory().size() << " stores.\n");
-    BatchCommitValueTo(Eval.getMutatedMemory());
+                      << F->getName() << "' to " << NewInitializers.size()
+                      << " stores.\n");
+    for (const auto &Pair : NewInitializers)
+      Pair.first->setInitializer(Pair.second);
     for (GlobalVariable *GV : Eval.getInvariants())
       GV->setConstant(true);
   }
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index b8a314c54f18..e064fbbef595 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -36,8 +36,14 @@ using namespace IRSimilarity;
 
 // A command flag to be used for debugging to exclude branches from similarity
 // matching and outlining.
+namespace llvm {
 extern cl::opt<bool> DisableBranches;
 
+// A command flag to be used for debugging to exclude indirect calls from
+// similarity matching and outlining.
+extern cl::opt<bool> DisableIndirectCalls;
+} // namespace llvm
+
 // Set to true if the user wants the ir outliner to run on linkonceodr linkage
 // functions. This is false by default because the linker can dedupe linkonceodr
 // functions. Since the outliner is confined to a single module (modulo LTO),
@@ -104,6 +110,16 @@ struct OutlinableGroup {
   /// of the region.
   unsigned BranchesToOutside = 0;
 
+  /// Tracker counting backwards from the highest unsigned value possible to
+  /// avoid conflicting with the GVNs of assigned values.  We start at -3 since
+  /// -2 and -1 are assigned by the DenseMap.
+  unsigned PHINodeGVNTracker = -3;
+
+  DenseMap<unsigned,
+           std::pair<std::pair<unsigned, unsigned>, SmallVector<unsigned, 2>>>
+      PHINodeGVNToGVNs;
+  DenseMap<hash_code, unsigned> GVNsToPHINodeGVN;
+
   /// The number of instructions that will be outlined by extracting \ref
   /// Regions.
   InstructionCost Benefit = 0;
@@ -169,6 +185,44 @@ Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other,
   return FoundValueOpt.getValueOr(nullptr);
 }
 
+/// Rewrite the BranchInsts in the incoming blocks to \p PHIBlock that are found
+/// in \p Included to branch to BasicBlock \p Replace if they currently branch
+/// to the BasicBlock \p Find.  This is used to fix up the incoming basic blocks
+/// when PHINodes are included in outlined regions.
+///
+/// \param PHIBlock - The BasicBlock containing the PHINodes that need to be
+/// checked.
+/// \param Find - The successor block to be replaced.
+/// \param Replace - The new successor block to branch to.
+/// \param Included - The set of blocks about to be outlined.
+static void replaceTargetsFromPHINode(BasicBlock *PHIBlock, BasicBlock *Find,
+                                      BasicBlock *Replace,
+                                      DenseSet<BasicBlock *> &Included) {
+  for (PHINode &PN : PHIBlock->phis()) {
+    for (unsigned Idx = 0, PNEnd = PN.getNumIncomingValues(); Idx != PNEnd;
+         ++Idx) {
+      // Check if the incoming block is included in the set of blocks being
+      // outlined.
+      BasicBlock *Incoming = PN.getIncomingBlock(Idx);
+      if (!Included.contains(Incoming))
+        continue;
+
+      BranchInst *BI = dyn_cast<BranchInst>(Incoming->getTerminator());
+      assert(BI && "Not a branch instruction?");
+      // Look over the branching instructions into this block to see if we
+      // used to branch to Find in this outlined block.
+      for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ != End;
+           Succ++) {
+        // If we have found the block to replace, we do so here.
+        if (BI->getSuccessor(Succ) != Find)
+          continue;
+        BI->setSuccessor(Succ, Replace);
+      }
+    }
+  }
+}
+
 void OutlinableRegion::splitCandidate() {
   assert(!CandidateSplit && "Candidate already split!");
 
@@ -199,6 +253,39 @@ void OutlinableRegion::splitCandidate() {
   StartBB = StartInst->getParent();
   PrevBB = StartBB;
 
+  DenseSet<BasicBlock *> BBSet;
+  Candidate->getBasicBlocks(BBSet);
+
+  // We iterate over the instructions in the region, if we find a PHINode, we
+  // check if there are predecessors outside of the region, if there are,
+  // we ignore this region since we are unable to handle the severing of the
+  // phi node right now.
+  BasicBlock::iterator It = StartInst->getIterator();
+  while (PHINode *PN = dyn_cast<PHINode>(&*It)) {
+    unsigned NumPredsOutsideRegion = 0;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!BBSet.contains(PN->getIncomingBlock(i)))
+        ++NumPredsOutsideRegion;
+
+    if (NumPredsOutsideRegion > 1)
+      return;
+
+    It++;
+  }
+
+  // If the region starts with a PHINode, but is not the initial instruction of
+  // the BasicBlock, we ignore this region for now.
+  if (isa<PHINode>(StartInst) && StartInst != &*StartBB->begin())
+    return;
+
+  // If the region ends with a PHINode, but does not contain all of the phi node
+  // instructions of the region, we ignore it for now.
+  if (isa<PHINode>(BackInst)) {
+    EndBB = BackInst->getParent();
+    if (BackInst != &*std::prev(EndBB->getFirstInsertionPt()))
+      return;
+  }
+
   // The basic block gets split like so:
   // block:                 block:
   //   inst1                  inst1
   //   inst2                  inst2
@@ -225,12 +312,20 @@ void OutlinableRegion::splitCandidate() {
   FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline");
   EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB);
   FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB);
-  return;
+  } else {
+    EndBB = BackInst->getParent();
+    EndsInBranch = true;
+    FollowBB = nullptr;
   }
 
-  EndBB = BackInst->getParent();
-  EndsInBranch = true;
-  FollowBB = nullptr;
+  // Refind the basic block set.
+  BBSet.clear();
+  Candidate->getBasicBlocks(BBSet);
+  // For the phi nodes in the new starting basic block of the region, we
+  // reassign the targets of the basic blocks branching instructions.
+  replaceTargetsFromPHINode(StartBB, PrevBB, StartBB, BBSet);
+  if (FollowBB)
+    replaceTargetsFromPHINode(FollowBB, EndBB, FollowBB, BBSet);
 }
 
 void OutlinableRegion::reattachCandidate() {
@@ -252,15 +347,21 @@
   //   inst4
 
   assert(StartBB != nullptr && "StartBB for Candidate is not defined!");
 
-  // StartBB should only have one predecessor since we put an unconditional
-  // branch at the end of PrevBB when we split the BasicBlock.
-  PrevBB = StartBB->getSinglePredecessor();
-  assert(PrevBB != nullptr &&
-         "No Predecessor for the region start basic block!");
-
-  assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!");
   PrevBB->getTerminator()->eraseFromParent();
 
+  // If we are reattaching after outlining, we iterate over the phi nodes of
+  // the initial block, and reassign the branch instructions of the incoming
+  // blocks to the block we are remerging into.
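  // E.g. (assumed block names, not part of this patch): a PHINode at the
  // start of the region block %region.start may list an incoming block whose
  // branch was redirected to %region.start when the candidate was split; the
  // calls below point those branches back at the block the region is being
  // merged into.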
+  if (!ExtractedFunction) {
+    DenseSet<BasicBlock *> BBSet;
+    Candidate->getBasicBlocks(BBSet);
+
+    replaceTargetsFromPHINode(StartBB, StartBB, PrevBB, BBSet);
+    if (!EndsInBranch)
+      replaceTargetsFromPHINode(FollowBB, FollowBB, EndBB, BBSet);
+  }
+
   moveBBContents(*StartBB, *PrevBB);
 
   BasicBlock *PlacementBB = PrevBB;
@@ -354,6 +455,24 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
   return Benefit;
 }
 
+/// Check the \p OutputMappings structure for value \p Input, if it exists
+/// it has been used as an output for outlining, and has been renamed, and we
+/// return the new value, otherwise, we return the same value.
+///
+/// \param OutputMappings [in] - The mapping of values to their renamed value
+/// after being used as an output for an outlined region.
+/// \param Input [in] - The value to find the remapped value of, if it exists.
+/// \return The remapped value if it has been renamed, and the same value if it
+/// has not.
+static Value *findOutputMapping(const DenseMap<Value *, Value *> OutputMappings,
+                                Value *Input) {
+  DenseMap<Value *, Value *>::const_iterator OutputMapping =
+      OutputMappings.find(Input);
+  if (OutputMapping != OutputMappings.end())
+    return OutputMapping->second;
+  return Input;
+}
+
 /// Find whether \p Region matches the global value numbering to Constant
 /// mapping found so far.
 ///
@@ -830,6 +949,209 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
   Region.NumExtractedInputs = OriginalIndex;
 }
 
+/// Check if \p V has any uses outside of the region other than \p PN.
+///
+/// \param V [in] - The value to check.
+/// \param PHILoc [in] - The location in the PHINode of \p V.
+/// \param PN [in] - The PHINode using \p V.
+/// \param Exits [in] - The potential blocks we exit to from the outlined
+/// region.
+/// \param BlocksInRegion [in] - The basic blocks contained in the region.
+/// \returns true if \p V has any uses outside its region other than \p PN.
+static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN,
+                            SmallPtrSet<BasicBlock *, 1> &Exits,
+                            DenseSet<BasicBlock *> &BlocksInRegion) {
+  // We check to see if the value is used by the PHINode from some other
+  // predecessor not included in the region.  If it is, we make sure
+  // to keep it as an output.
+  SmallVector<unsigned, 2> IncomingNumbers(PN.getNumIncomingValues());
+  std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0);
+  if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) {
+        return (Idx != PHILoc && V == PN.getIncomingValue(Idx) &&
+                !BlocksInRegion.contains(PN.getIncomingBlock(Idx)));
+      }))
+    return true;
+
+  // Check if the value is used by any other instructions outside the region.
+  return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) {
+    Instruction *I = dyn_cast<Instruction>(U);
+    if (!I)
+      return false;
+
+    // If the use of the item is inside the region, we skip it.  Uses
+    // inside the region give us useful information about how the item could be
+    // used as an output.
+    BasicBlock *Parent = I->getParent();
+    if (BlocksInRegion.contains(Parent))
+      return false;
+
+    // If it's not a PHINode then we definitely know the use matters.  This
+    // output value will not be completely combined with another item in a
+    // PHINode, as it is directly referenced by another non-phi instruction.
+    if (!isa<PHINode>(I))
+      return true;
+
+    // If we have a PHINode outside one of the exit locations, then it
+    // can be considered an outside use as well.
+    // If there is a PHINode
+    // contained in the Exit where this value's use matters, it will be
+    // caught when we analyze that PHINode.
+    if (!Exits.contains(Parent))
+      return true;
+
+    return false;
+  });
+}
+
+/// Test whether \p CurrentExitFromRegion contains any PhiNodes that should be
+/// considered outputs.  A PHINode is an output when more than one incoming
+/// value has been marked by the CodeExtractor as an output.
+///
+/// \param CurrentExitFromRegion [in] - The block to analyze.
+/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the
+/// region.
+/// \param RegionBlocks [in] - The basic blocks in the region.
+/// \param Outputs [in, out] - The existing outputs for the region, we may add
+/// PHINodes to this as we find that they replace output values.
+/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are
+/// totally replaced by a PHINode.
+/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used
+/// in PHINodes, but have other uses, and should still be considered outputs.
+static void analyzeExitPHIsForOutputUses(
+    BasicBlock *CurrentExitFromRegion,
+    SmallPtrSet<BasicBlock *, 1> &PotentialExitsFromRegion,
+    DenseSet<BasicBlock *> &RegionBlocks, SetVector<Value *> &Outputs,
+    DenseSet<Value *> &OutputsReplacedByPHINode,
+    DenseSet<Value *> &OutputsWithNonPhiUses) {
+  for (PHINode &PN : CurrentExitFromRegion->phis()) {
+    // Find all incoming values from the outlining region.
+    SmallVector<unsigned, 2> IncomingVals;
+    for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I)
+      if (RegionBlocks.contains(PN.getIncomingBlock(I)))
+        IncomingVals.push_back(I);
+
+    // Do not process PHI if there are no predecessors from region.
+    unsigned NumIncomingVals = IncomingVals.size();
+    if (NumIncomingVals == 0)
+      continue;
+
+    // If there is one predecessor, we mark it as a value that needs to be kept
+    // as an output.
+    if (NumIncomingVals == 1) {
+      Value *V = PN.getIncomingValue(*IncomingVals.begin());
+      OutputsWithNonPhiUses.insert(V);
+      OutputsReplacedByPHINode.erase(V);
+      continue;
+    }
+
+    // This PHINode will be used as an output value, so we add it to our list.
+    Outputs.insert(&PN);
+
+    // Not all of the incoming values should be ignored as other inputs and
+    // outputs may have uses in the outlined region.  If they have other uses
+    // outside of the single PHINode we should not skip over it.
+    for (unsigned Idx : IncomingVals) {
+      Value *V = PN.getIncomingValue(Idx);
+      if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) {
+        OutputsWithNonPhiUses.insert(V);
+        OutputsReplacedByPHINode.erase(V);
+        continue;
+      }
+      if (!OutputsWithNonPhiUses.contains(V))
+        OutputsReplacedByPHINode.insert(V);
+    }
+  }
+}
+
+// Represents the type for the unsigned number denoting the output number for
+// a phi node, along with the canonical number for the exit block.
+using ArgLocWithBBCanon = std::pair<unsigned, unsigned>;
+// The list of canonical numbers for the incoming values to a PHINode.
+using CanonList = SmallVector<unsigned, 2>;
+// The pair type representing the set of canonical values being combined in the
+// PHINode, along with the location data for the PHINode.
+using PHINodeData = std::pair<ArgLocWithBBCanon, CanonList>;
+
+/// Encode \p PND as an integer for easy lookup based on the argument location,
+/// the parent BasicBlock canonical numbering, and the canonical numbering of
+/// the values stored in the PHINode.
+///
+/// \param PND - The data to hash.
+/// \returns The hash code of \p PND.
+static hash_code encodePHINodeData(PHINodeData &PND) {
+  return llvm::hash_combine(
+      llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second),
+      llvm::hash_combine_range(PND.second.begin(), PND.second.end()));
+}
+
+/// Create a special GVN for PHINodes that will be used outside of
+/// the region.  We create a hash code based on the Canonical number of the
+/// parent BasicBlock, the canonical numbering of the values stored in the
+/// PHINode and the aggregate argument location.  This is used to find whether
+/// this PHINode type has been given a canonical numbering already.  If not, we
+/// assign it a value and store it for later use.  The value is returned to
+/// identify different output schemes for the set of regions.
+///
+/// \param Region - The region that \p PN is an output for.
+/// \param PN - The PHINode we are analyzing.
+/// \param AggArgIdx - The argument \p PN will be stored into.
+/// \returns An optional holding the assigned canonical number, or None if
+/// there is some attribute of the PHINode blocking it from being used.
+static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region,
+                                           PHINode *PN, unsigned AggArgIdx) {
+  OutlinableGroup &Group = *Region.Parent;
+  IRSimilarityCandidate &Cand = *Region.Candidate;
+  BasicBlock *PHIBB = PN->getParent();
+  CanonList PHIGVNs;
+  for (Value *Incoming : PN->incoming_values()) {
+    // If we cannot find a GVN, this means that the input to the PHINode is
+    // not included in the region we are trying to analyze, meaning, that if
+    // it was outlined, we would be adding an extra input.  We ignore this
+    // case for now, and so ignore the region.
+    Optional<unsigned> OGVN = Cand.getGVN(Incoming);
+    if (!OGVN.hasValue()) {
+      Region.IgnoreRegion = true;
+      return None;
+    }
+
+    // Collect the canonical numbers of the values in the PHINode.
+    unsigned GVN = OGVN.getValue();
+    OGVN = Cand.getCanonicalNum(GVN);
+    assert(OGVN.hasValue() && "No GVN found for incoming value?");
+    PHIGVNs.push_back(*OGVN);
+  }
+
+  // Now that we have the GVNs for the incoming values, we are going to combine
+  // them with the GVN of the incoming block, and the output location of the
+  // PHINode to generate a hash value representing this instance of the PHINode.
+  DenseMap<hash_code, unsigned>::iterator GVNToPHIIt;
+  DenseMap<unsigned, PHINodeData>::iterator PHIToGVNIt;
+  Optional<unsigned> BBGVN = Cand.getGVN(PHIBB);
+  assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!");
+
+  BBGVN = Cand.getCanonicalNum(BBGVN.getValue());
+  assert(BBGVN.hasValue() &&
+         "Could not find canonical number for the incoming block!");
+  // Create a pair of the exit block canonical value, and the aggregate
+  // argument location, connected to the canonical numbers stored in the
+  // PHINode.
+  PHINodeData TemporaryPair =
+      std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs);
+  hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair);
+
+  // Look for and create a new entry in our connection between canonical
+  // numbers for PHINodes, and the set of objects we just created.
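  // Worked example (values assumed for exposition, not part of this patch):
  // a PHINode in an exit block with canonical number 3, stored into aggregate
  // argument 2, merging incoming values with canonical numbers {5, 7}, hashes
  // as hash_combine(3, 2, [5, 7]).  The first time that hash is seen it is
  // mapped to the next PHINode GVN (starting at -3 and counting down), and
  // every later region producing the same shape gets the same number back.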
+ GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash); + if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) { + bool Inserted = false; + std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert( + std::make_pair(Group.PHINodeGVNTracker, TemporaryPair)); + std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert( + std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--)); + } + + return GVNToPHIIt->second; +} + /// Create a mapping of the output arguments for the \p Region to the output /// arguments of the overall outlined function. /// @@ -842,35 +1164,25 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, IRSimilarityCandidate &C = *Region.Candidate; SmallVector<BasicBlock *> BE; - DenseSet<BasicBlock *> BBSet; - C.getBasicBlocks(BBSet, BE); + DenseSet<BasicBlock *> BlocksInRegion; + C.getBasicBlocks(BlocksInRegion, BE); // Find the exits to the region. SmallPtrSet<BasicBlock *, 1> Exits; for (BasicBlock *Block : BE) for (BasicBlock *Succ : successors(Block)) - if (!BBSet.contains(Succ)) + if (!BlocksInRegion.contains(Succ)) Exits.insert(Succ); // After determining which blocks exit to PHINodes, we add these PHINodes to // the set of outputs to be processed. We also check the incoming values of // the PHINodes for whether they should no longer be considered outputs. - for (BasicBlock *ExitBB : Exits) { - for (PHINode &PN : ExitBB->phis()) { - // Find all incoming values from the outlining region. - SmallVector<unsigned, 2> IncomingVals; - for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx) - if (BBSet.contains(PN.getIncomingBlock(Idx))) - IncomingVals.push_back(Idx); - - // Do not process PHI if there is one (or fewer) predecessor from region. - if (IncomingVals.size() <= 1) - continue; - - Region.IgnoreRegion = true; - return; - } - } + DenseSet<Value *> OutputsReplacedByPHINode; + DenseSet<Value *> OutputsWithNonPhiUses; + for (BasicBlock *ExitBB : Exits) + analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs, + OutputsReplacedByPHINode, + OutputsWithNonPhiUses); // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -893,9 +1205,13 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, // do not have to be in same order, but are functionally the same, we will // have to use a different scheme, as one-to-one correspondence is not // guaranteed. - unsigned GlobalValue = C.getGVN(Output).getValue(); unsigned ArgumentSize = Group.ArgumentTypes.size(); + // If the output is combined in a PHINode, we make sure to skip over it. + if (OutputsReplacedByPHINode.contains(Output)) + continue; + + unsigned AggArgIdx = 0; for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) continue; @@ -907,7 +1223,7 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, AggArgsUsed.insert(Jdx); Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + AggArgIdx = Jdx; break; } @@ -916,18 +1232,54 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, // function to handle this output and create a mapping to it. if (!TypeFound) { Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType())); - AggArgsUsed.insert(Group.ArgumentTypes.size() - 1); + // Mark the new pointer type as the last value in the aggregate argument + // list. 
+      unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1;
+      AggArgsUsed.insert(ArgTypeIdx);
       Region.ExtractedArgToAgg.insert(
-          std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+          std::make_pair(OriginalIndex, ArgTypeIdx));
       Region.AggArgToExtracted.insert(
-          std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
-      Region.GVNStores.push_back(GlobalValue);
+          std::make_pair(ArgTypeIdx, OriginalIndex));
+      AggArgIdx = ArgTypeIdx;
+    }
+
+    // TODO: Adapt to the extra input from the PHINode.
+    PHINode *PN = dyn_cast<PHINode>(Output);
+
+    Optional<unsigned> GVN;
+    if (PN && !BlocksInRegion.contains(PN->getParent())) {
+      // Values outside the region can be combined into PHINode when we
+      // have multiple exits.  We collect both of these into a list to identify
+      // which values are being used in the PHINode.  Each list identifies a
+      // different PHINode, and a different output.  We store the PHINode as its
+      // own canonical value.  These canonical values are also dependent on the
+      // output argument it is saved to.
+
+      // If two PHINodes have the same canonical values, but different aggregate
+      // argument locations, then they will have distinct Canonical Values.
+      GVN = getGVNForPHINode(Region, PN, AggArgIdx);
+      if (!GVN.hasValue())
+        return;
+    } else {
+      // If we do not have a PHINode we use the global value numbering for the
+      // output value, to find the canonical number to add to the set of stored
+      // values.
+      GVN = C.getGVN(Output);
+      GVN = C.getCanonicalNum(*GVN);
+    }
 
-    stable_sort(Region.GVNStores);
+    // Each region has a potentially unique set of outputs.  We save which
+    // values are output in a list of canonical values so we can differentiate
+    // among the different store schemes.
+    Region.GVNStores.push_back(*GVN);
+
     OriginalIndex++;
     TypeIndex++;
   }
+
+  // We sort the stored values to make sure that we are not affected by
+  // analysis order when determining what combination of items were stored.
+  stable_sort(Region.GVNStores);
 }
 
 void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
@@ -1063,6 +1415,214 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
   return Call;
 }
 
+/// Find or create a BasicBlock in the outlined function containing PhiBlocks
+/// for \p RetVal.
+///
+/// \param Group - The OutlinableGroup containing the information about the
+/// overall outlined function.
+/// \param RetVal - The return value or exit option that we are currently
+/// evaluating.
+/// \returns The found or newly created BasicBlock to contain the needed
+/// PHINodes to be used as outputs.
+static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) {
+  DenseMap<Value *, BasicBlock *>::iterator PhiBlockForRetVal,
+      ReturnBlockForRetVal;
+  PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+  ReturnBlockForRetVal = Group.EndBBs.find(RetVal);
+  assert(ReturnBlockForRetVal != Group.EndBBs.end() &&
+         "Could not find output value!");
+  BasicBlock *ReturnBB = ReturnBlockForRetVal->second;
+
+  // Find if a PHIBlock exists for this return value already.  If it is
+  // the first time we are analyzing this, we will not, so we record it.
+  PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+  if (PhiBlockForRetVal != Group.PHIBlocks.end())
+    return PhiBlockForRetVal->second;
+
+  // If we did not find a block, we create one, and insert it into the
+  // overall function and record it.
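  // CFG sketch of the rewrite below (assumed block names, not part of this
  // patch):
  //
  //   before:  %exit_a -> %ret_bb          after:  %exit_a -> %phi_block
  //            %exit_b -> %ret_bb                  %exit_b -> %phi_block
  //                                                %phi_block -> %ret_bb
  //
  // The merged output PHINodes for this exit then live in %phi_block.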
+ bool Inserted = false; + BasicBlock *PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block", + ReturnBB->getParent()); + std::tie(PhiBlockForRetVal, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + + // We find the predecessors of the return block in the newly created outlined + // function in order to point them to the new PHIBlock rather than the already + // existing return block. + SmallVector<BranchInst *, 2> BranchesToChange; + for (BasicBlock *Pred : predecessors(ReturnBB)) + BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator())); + + // Now we mark the branch instructions found, and change the references of the + // return block to the newly created PHIBlock. + for (BranchInst *BI : BranchesToChange) + for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) { + if (BI->getSuccessor(Succ) != ReturnBB) + continue; + BI->setSuccessor(Succ, PHIBlock); + } + + BranchInst::Create(ReturnBB, PHIBlock); + + return PhiBlockForRetVal->second; +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location if the +/// call has already been replaced with a call to the overall, aggregate +/// function. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region corresponding to the outlined function. +/// \returns The Value representing \p A at the call site. +static Value * +getPassedArgumentInAlreadyOutlinedFunction(const Argument *A, + const OutlinableRegion &Region) { + // If we don't need to adjust the argument number at all (since the call + // has already been replaced by a call to the overall outlined function) + // we can just get the specified argument. + return Region.Call->getArgOperand(A->getArgNo()); +} + +/// For the function call now representing the \p Region, find the passed value +/// to that call that represents Argument \p A at the call location if the +/// call has only been replaced by the call to the aggregate function. +/// +/// \param A - The Argument to get the passed value for. +/// \param Region - The extracted Region corresponding to the outlined function. +/// \returns The Value representing \p A at the call site. +static Value * +getPassedArgumentAndAdjustArgumentLocation(const Argument *A, + const OutlinableRegion &Region) { + unsigned ArgNum = A->getArgNo(); + + // If it is a constant, we can look at our mapping from when we created + // the outputs to figure out what the constant value is. + if (Region.AggArgToConstant.count(ArgNum)) + return Region.AggArgToConstant.find(ArgNum)->second; + + // If it is not a constant, and we are not looking at the overall function, we + // need to adjust which argument we are looking at. + ArgNum = Region.AggArgToExtracted.find(ArgNum)->second; + return Region.Call->getArgOperand(ArgNum); +} + +/// Find the canonical numbering for the incoming Values into the PHINode \p PN. +/// +/// \param PN [in] - The PHINode that we are finding the canonical numbers for. +/// \param Region [in] - The OutlinableRegion containing \p PN. +/// \param OutputMappings [in] - The mapping of output values from outlined +/// region to their original values. +/// \param CanonNums [out] - The canonical numbering for the incoming values to +/// \p PN. +/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call +/// of \p Region rather than the overall function's call. 
+static void
+findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
+                    const DenseMap<Value *, Value *> &OutputMappings,
+                    DenseSet<unsigned> &CanonNums,
+                    bool ReplacedWithOutlinedCall = true) {
+  // Iterate over the incoming values.
+  for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
+    Value *IVal = PN->getIncomingValue(Idx);
+    // If we have an argument as incoming value, we need to grab the passed
+    // value from the call itself.
+    if (Argument *A = dyn_cast<Argument>(IVal)) {
+      if (ReplacedWithOutlinedCall)
+        IVal = getPassedArgumentInAlreadyOutlinedFunction(A, Region);
+      else
+        IVal = getPassedArgumentAndAdjustArgumentLocation(A, Region);
+    }
+
+    // Get the original value if it has been replaced by an output value.
+    IVal = findOutputMapping(OutputMappings, IVal);
+
+    // Find and add the canonical number for the incoming value.
+    Optional<unsigned> GVN = Region.Candidate->getGVN(IVal);
+    assert(GVN.hasValue() && "No GVN for incoming value");
+    Optional<unsigned> CanonNum = Region.Candidate->getCanonicalNum(*GVN);
+    assert(CanonNum.hasValue() && "No Canonical Number for GVN");
+    CanonNums.insert(*CanonNum);
+  }
+}
+
+/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock
+/// in order to condense the number of instructions added to the outlined
+/// function.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
+/// \p PN in.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \return the newly found or created PHINode in \p OverallPhiBlock.
+static PHINode*
+findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
+                       BasicBlock *OverallPhiBlock,
+                       const DenseMap<Value *, Value *> &OutputMappings) {
+  OutlinableGroup &Group = *Region.Parent;
+
+  DenseSet<unsigned> PNCanonNums;
+  // We have to use the extracted function since we have not merged this region
+  // into the overall function yet.  We make sure to reassign the argument
+  // numbering since it is possible that the argument ordering is different
+  // between the functions.
+  findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums,
+                      /* ReplacedWithOutlinedCall = */ false);
+
+  OutlinableRegion *FirstRegion = Group.Regions[0];
+  DenseSet<unsigned> CurrentCanonNums;
+  // Find the Canonical Numbering for each PHINode, if it matches, we replace
+  // the uses of the PHINode we are searching for, with the found PHINode.
+  for (PHINode &CurrPN : OverallPhiBlock->phis()) {
+    CurrentCanonNums.clear();
+    findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums,
+                        /* ReplacedWithOutlinedCall = */ true);
+
+    if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) {
+          return CurrentCanonNums.contains(CanonNum);
+        }))
+      return &CurrPN;
+  }
+
+  // If we've made it here, it means we weren't able to replace the PHINode, so
+  // we must insert it ourselves.
+  PHINode *NewPN = cast<PHINode>(PN.clone());
+  NewPN->insertBefore(&*OverallPhiBlock->begin());
+  for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx;
+       Idx++) {
+    Value *IncomingVal = NewPN->getIncomingValue(Idx);
+    BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx);
+
+    // Find corresponding basic block in the overall function for the incoming
+    // block.
+    Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI();
+    assert(FirstNonPHI && "Incoming block is empty?");
+    Value *CorrespondingVal =
+        Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI);
+    assert(CorrespondingVal && "Value is nullptr?");
+    BasicBlock *BlockToUse = cast<Instruction>(CorrespondingVal)->getParent();
+    NewPN->setIncomingBlock(Idx, BlockToUse);
+
+    // If we have an argument we make sure we replace using the argument from
+    // the correct function.
+    if (Argument *A = dyn_cast<Argument>(IncomingVal)) {
+      Value *Val = Group.OutlinedFunction->getArg(A->getArgNo());
+      NewPN->setIncomingValue(Idx, Val);
+      continue;
+    }
+
+    // Find the corresponding value in the overall function.
+    IncomingVal = findOutputMapping(OutputMappings, IncomingVal);
+    Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal);
+    assert(Val && "Value is nullptr?");
+    NewPN->setIncomingValue(Idx, Val);
+  }
+  return NewPN;
+}
+
 // Within an extracted function, replace the argument uses of the extracted
 // region with the arguments of the function for an OutlinableGroup.
 //
@@ -1075,6 +1635,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
 static void
 replaceArgumentUses(OutlinableRegion &Region,
                     DenseMap<Value *, BasicBlock *> &OutputBBs,
+                    const DenseMap<Value *, Value *> &OutputMappings,
                     bool FirstFunction = false) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
@@ -1144,12 +1705,47 @@ replaceArgumentUses(OutlinableRegion &Region,
       LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
                         << *OutputBB << "\n");
 
-      if (FirstFunction)
+      // If this is storing a PHINode, we must make sure it is included in the
+      // overall function.
+      if (!isa<PHINode>(ValueOperand) ||
+          Region.Candidate->getGVN(ValueOperand).hasValue()) {
+        if (FirstFunction)
+          continue;
+        Value *CorrVal =
+            Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+        assert(CorrVal && "Value is nullptr?");
+        NewI->setOperand(0, CorrVal);
+        continue;
+      }
+      PHINode *PN = cast<PHINode>(SI->getValueOperand());
+      // If it has a value, it was not split by the code extractor, which
+      // is what we are looking for.
+      if (Region.Candidate->getGVN(PN).hasValue())
         continue;
-      Value *CorrVal =
-          Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
-      assert(CorrVal && "Value is nullptr?");
-      NewI->setOperand(0, CorrVal);
+
+      // We record the parent block for the PHINode in the Region so that
+      // we can exclude it from checks later on.
+      Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent()));
+
+      // If this is the first function, we do not need to worry about merging
+      // this with any other block in the overall outlined function, so we can
+      // just continue.
+      if (FirstFunction) {
+        BasicBlock *PHIBlock = PN->getParent();
+        Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+        continue;
+      }
+
+      // We look for the aggregate block that contains the PHINodes leading into
+      // this exit path.  If we can't find one, we create one.
+      BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal);
+
+      // For our PHINode, we find the combined canonical numbering, and
+      // attempt to find a matching PHINode in the overall PHIBlock.  If we
+      // cannot, we copy the PHINode and move it into this new block.
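  // For example (an assumed scenario, not part of this patch): if this region
  // stores
  //   %out = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  // and the overall phi_block already holds a PHINode whose incoming values
  // map to the same canonical numbers as %a and %b, that PHINode is reused;
  // otherwise %out is cloned into phi_block with its operands remapped to the
  // first region's corresponding values.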
+ PHINode *NewPN =
+ findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings);
+ NewI->setOperand(0, NewPN);
 }

 // If we added an edge for basic blocks without a predecessor, we remove it
@@ -1390,7 +1986,12 @@ void createSwitchStatement(
 Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
 std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
 // We only need the switch statement if there is more than one store
- // combination.
+ // combination, or there is more than one set of output blocks. The first
+ // will occur when we store different sets of values for two different
+ // regions. The second will occur when we have two outputs that are combined
+ // in a PHINode outside of the region in one outlined instance, and are used
+ // separately in another. This will create the same set of OutputGVNs, but
+ // will generate two different output schemes.
 if (OG.OutputGVNCombinations.size() > 1) {
 Function *AggFunc = OG.OutlinedFunction;
 // Create a final block for each different return block.
@@ -1433,8 +2034,14 @@ void createSwitchStatement(
 return;
 }

+ assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!");
+
 // If there needs to be stores, move them from the output blocks to their
- // corresponding ending block.
+ // corresponding ending block. We do not check that the OutputGVNCombinations
+ // is equal to 1 here since that could just be the case where there are 0
+ // outputs. Instead, we check whether there is more than one set of output
+ // blocks since this is the only case where we would have to move the
+ // stores, and erase the extraneous blocks.
 if (OutputStoreBBs.size() == 1) {
 LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
 << *OG.OutlinedFunction << "\n");
@@ -1466,10 +2073,13 @@ void createSwitchStatement(
 /// set of stores needed for the different functions.
 /// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
+/// \param [in] OutputMappings - The mapping of output values from the
+/// outlined regions to their original values.
 static void fillOverallFunction(
 Module &M, OutlinableGroup &CurrentGroup,
 std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
- std::vector<Function *> &FuncsToRemove) {
+ std::vector<Function *> &FuncsToRemove,
+ const DenseMap<Value *, Value *> &OutputMappings) {
 OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];

 // Move first extracted function's instructions into new function.
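The PHI-merging logic above reuses an existing PHINode in the overall PHI block when every canonical number of the new node's incoming values already appears in the candidate's set. A stand-alone C++ sketch of that subset test; the plain STL containers and names are illustrative stand-ins for the LLVM types, not the real API:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <set>
#include <vector>

using CanonSet = std::set<unsigned>;

// An existing PHI can be reused when it covers every canonical number of the
// incoming values of the PHI we are trying to place.
static bool canReusePHI(const CanonSet &Incoming, const CanonSet &Existing) {
  return std::all_of(Incoming.begin(), Incoming.end(),
                     [&](unsigned C) { return Existing.count(C) != 0; });
}

int main() {
  std::vector<CanonSet> OverallPhiBlock = {{4, 7}, {2, 9, 11}};
  CanonSet Incoming = {9, 2};
  for (std::size_t I = 0; I != OverallPhiBlock.size(); ++I)
    if (canReusePHI(Incoming, OverallPhiBlock[I])) {
      std::cout << "reuse PHI #" << I << "\n"; // prints: reuse PHI #1
      return 0;
    }
  std::cout << "clone a new PHI into the block\n";
}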
@@ -1489,7 +2099,7 @@ static void fillOverallFunction( CurrentGroup.OutlinedFunction, "output_block_0"); CurrentOS->OutputBlockNum = 0; - replaceArgumentUses(*CurrentOS, NewBBs, true); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true); replaceConstants(*CurrentOS); // We first identify if any output blocks are empty, if they are we remove @@ -1523,7 +2133,8 @@ void IROutliner::deduplicateExtractedSections( OutlinableRegion *CurrentOS; - fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove); + fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove, + OutputMappings); std::vector<Value *> SortedKeys; for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) { @@ -1537,8 +2148,7 @@ void IROutliner::deduplicateExtractedSections( createAndInsertBasicBlocks( CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction, "output_block_" + Twine(static_cast<unsigned>(Idx))); - - replaceArgumentUses(*CurrentOS, NewBBs); + replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings); alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs, CurrentGroup.EndBBs, OutputMappings, OutputStoreBBs); @@ -1637,7 +2247,7 @@ void IROutliner::pruneIncompatibleRegions( if (FirstCandidate.getLength() == 2) { if (isa<CallInst>(FirstCandidate.front()->Inst) && isa<BranchInst>(FirstCandidate.back()->Inst)) - return; + return; } unsigned CurrentEndIdx = 0; @@ -1706,6 +2316,34 @@ IROutliner::findBenefitFromAllRegions(OutlinableGroup &CurrentGroup) { return RegionBenefit; } +/// For the \p OutputCanon number passed in find the value represented by this +/// canonical number. If it is from a PHINode, we pick the first incoming +/// value and return that Value instead. +/// +/// \param Region - The OutlinableRegion to get the Value from. +/// \param OutputCanon - The canonical number to find the Value from. +/// \returns The Value represented by a canonical number \p OutputCanon in \p +/// Region. +static Value *findOutputValueInRegion(OutlinableRegion &Region, + unsigned OutputCanon) { + OutlinableGroup &CurrentGroup = *Region.Parent; + // If the value is greater than the value in the tracker, we have a + // PHINode and will instead use one of the incoming values to find the + // type. + if (OutputCanon > CurrentGroup.PHINodeGVNTracker) { + auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon); + assert(It != CurrentGroup.PHINodeGVNToGVNs.end() && + "Could not find GVN set for PHINode number!"); + assert(It->second.second.size() > 0 && "PHINode does not have any values!"); + OutputCanon = *It->second.second.begin(); + } + Optional<unsigned> OGVN = Region.Candidate->fromCanonicalNum(OutputCanon); + assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?"); + Optional<Value *> OV = Region.Candidate->fromGVN(*OGVN); + assert(OV.hasValue() && "Could not find value for GVN?"); + return *OV; +} + InstructionCost IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) { InstructionCost OverallCost = 0; @@ -1713,10 +2351,8 @@ IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) { TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent()); // Each output incurs a load after the call, so we add that to the cost. 
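findOutputValueInRegion above resolves an output's canonical number in two steps: numbers above PHINodeGVNTracker stand for PHINodes and are first swapped for the canonical number of one of their incoming values, which is then mapped back through the GVN to a Value. A small model of that lookup order; the STL maps and names are hypothetical stand-ins for the real structures:

#include <cassert>
#include <iostream>
#include <map>
#include <vector>

int main() {
  const unsigned PHINodeGVNTracker = 100; // numbers above this denote PHIs
  // PHI canonical number -> canonical numbers of its incoming values.
  std::map<unsigned, std::vector<unsigned>> PHICanonToIncoming = {{101, {5, 6}}};
  // Canonical number -> global value number (fromCanonicalNum in the patch).
  std::map<unsigned, unsigned> CanonToGVN = {{5, 40}, {6, 41}};

  unsigned OutputCanon = 101;
  if (OutputCanon > PHINodeGVNTracker) {
    auto It = PHICanonToIncoming.find(OutputCanon);
    assert(It != PHICanonToIncoming.end() && !It->second.empty());
    OutputCanon = It->second.front(); // fall back to the first incoming value
  }
  std::cout << "resolved GVN: " << CanonToGVN.at(OutputCanon) << "\n"; // 40
}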
- for (unsigned OutputGVN : Region->GVNStores) { - Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : Region->GVNStores) { + Value *V = findOutputValueInRegion(*Region, OutputCanon); InstructionCost LoadCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -1745,6 +2381,7 @@ static InstructionCost findCostForOutputBlocks(Module &M, InstructionCost OutputCost = 0; unsigned NumOutputBranches = 0; + OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0]; IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate; DenseSet<BasicBlock *> CandidateBlocks; Candidate.getBasicBlocks(CandidateBlocks); @@ -1770,10 +2407,8 @@ static InstructionCost findCostForOutputBlocks(Module &M, for (const ArrayRef<unsigned> &OutputUse : CurrentGroup.OutputGVNCombinations) { - for (unsigned GVN : OutputUse) { - Optional<Value *> OV = Candidate.fromGVN(GVN); - assert(OV.hasValue() && "Could not find value for GVN?"); - Value *V = OV.getValue(); + for (unsigned OutputCanon : OutputUse) { + Value *V = findOutputValueInRegion(FirstRegion, OutputCanon); InstructionCost StoreCost = TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0, TargetTransformInfo::TCK_CodeSize); @@ -1974,6 +2609,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) { unsigned IROutliner::doOutline(Module &M) { // Find the possible similarity sections. InstructionClassifier.EnableBranches = !DisableBranches; + InstructionClassifier.EnableIndirectCalls = !DisableIndirectCalls; IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity(); @@ -2033,8 +2669,8 @@ unsigned IROutliner::doOutline(Module &M) { continue; SmallVector<BasicBlock *> BE; - DenseSet<BasicBlock *> BBSet; - OS->Candidate->getBasicBlocks(BBSet, BE); + DenseSet<BasicBlock *> BlocksInRegion; + OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); @@ -2144,8 +2780,8 @@ unsigned IROutliner::doOutline(Module &M) { OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { SmallVector<BasicBlock *> BE; - DenseSet<BasicBlock *> BBSet; - OS->Candidate->getBasicBlocks(BBSet, BE); + DenseSet<BasicBlock *> BlocksInRegion; + OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 4e3689f09536..49babc24cb82 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -92,6 +92,11 @@ static cl::opt<bool> DisableInlinedAllocaMerging("disable-inlined-alloca-merging", cl::init(false), cl::Hidden); +/// A flag for test, so we can print the content of the advisor when running it +/// as part of the default (e.g. -O3) pipeline. 
+static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing", + cl::init(false), cl::Hidden); + extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats; static cl::opt<std::string> CGSCCInlineReplayFile( @@ -660,7 +665,7 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG, } if (!DeadFunctionsInComdats.empty()) { // Filter out the functions whose comdats remain alive. - filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats); + filterDeadComdatFunctions(DeadFunctionsInComdats); // Remove the rest. for (Function *F : DeadFunctionsInComdats) RemoveCGN(CG[F]); @@ -741,7 +746,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M); Advisor.onPassEntry(); - auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); }); + auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(&InitialC); }); // We use a single common worklist for calls across the entire SCC. We // process these in-order and append new calls introduced during inlining to @@ -823,6 +828,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // defer deleting these to make it easier to handle the call graph updates. SmallVector<Function *, 4> DeadFunctions; + // Track potentially dead non-local functions with comdats to see if they can + // be deleted as a batch after inlining. + SmallVector<Function *, 4> DeadFunctionsInComdats; + // Loop forward over all of the calls. while (!Calls->empty()) { // We expect the calls to typically be batched with sequences of calls that @@ -935,16 +944,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // Merge the attributes based on the inlining. AttributeFuncs::mergeAttributesForInlining(F, Callee); - // For local functions, check whether this makes the callee trivially - // dead. In that case, we can drop the body of the function eagerly - // which may reduce the number of callers of other functions to one, - // changing inline cost thresholds. + // For local functions or discardable functions without comdats, check + // whether this makes the callee trivially dead. In that case, we can drop + // the body of the function eagerly which may reduce the number of callers + // of other functions to one, changing inline cost thresholds. Non-local + // discardable functions with comdats are checked later on. bool CalleeWasDeleted = false; - if (Callee.hasLocalLinkage()) { - // To check this we also need to nuke any dead constant uses (perhaps - // made dead by this operation on other functions). - Callee.removeDeadConstantUsers(); - if (Callee.use_empty() && !CG.isLibFunction(Callee)) { + if (Callee.isDiscardableIfUnused() && Callee.hasZeroLiveUses() && + !CG.isLibFunction(Callee)) { + if (Callee.hasLocalLinkage() || !Callee.hasComdat()) { Calls->erase_if([&](const std::pair<CallBase *, int> &Call) { return Call.first->getCaller() == &Callee; }); @@ -957,6 +965,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, "Cannot put cause a function to become dead twice!"); DeadFunctions.push_back(&Callee); CalleeWasDeleted = true; + } else { + DeadFunctionsInComdats.push_back(&Callee); } } if (CalleeWasDeleted) @@ -1019,6 +1029,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, FAM.invalidate(F, PreservedAnalyses::none()); } + // We must ensure that we only delete functions with comdats if every function + // in the comdat is going to be deleted. 
+ if (!DeadFunctionsInComdats.empty()) { + filterDeadComdatFunctions(DeadFunctionsInComdats); + for (auto *Callee : DeadFunctionsInComdats) + Callee->dropAllReferences(); + DeadFunctions.append(DeadFunctionsInComdats); + } + // Now that we've finished inlining all of the calls across this SCC, delete // all of the trivially dead functions, updating the call graph and the CGSCC // pass manager in the process. @@ -1045,14 +1064,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, UR.UpdatedC = nullptr; // And delete the actual function from the module. - // The Advisor may use Function pointers to efficiently index various - // internal maps, e.g. for memoization. Function cleanup passes like - // argument promotion create new functions. It is possible for a new - // function to be allocated at the address of a deleted function. We could - // index using names, but that's inefficient. Alternatively, we let the - // Advisor free the functions when it sees fit. - DeadF->getBasicBlockList().clear(); - M.getFunctionList().remove(DeadF); + M.getFunctionList().erase(DeadF); ++NumDeleted; } @@ -1073,8 +1085,7 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params, bool MandatoryFirst, InliningAdvisorMode Mode, unsigned MaxDevirtIterations) - : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations), - PM(), MPM() { + : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations) { // Run the inliner first. The theory is that we are walking bottom-up and so // the callees have already been fully optimized, and we want to inline them // into the callers so that our optimizations can reflect that. @@ -1118,7 +1129,8 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M, // Discard the InlineAdvisor, a subsequent inlining session should construct // its own. auto PA = PreservedAnalyses::all(); - PA.abandon<InlineAdvisorAnalysis>(); + if (!KeepAdvisorForPrinting) + PA.abandon<InlineAdvisorAnalysis>(); return PA; } diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp index ebf080e87c3b..d515303e4911 100644 --- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp +++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp @@ -335,14 +335,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M, FAM.clear(*DeadF, DeadF->getName()); // And delete the actual function from the module. - // The Advisor may use Function pointers to efficiently index various - // internal maps, e.g. for memoization. Function cleanup passes like - // argument promotion create new functions. It is possible for a new - // function to be allocated at the address of a deleted function. We could - // index using names, but that's inefficient. Alternatively, we let the - // Advisor free the functions when it sees fit. 
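The batch deletion above has to respect comdat semantics: a dead function that participates in a comdat may only be dropped once every member of that comdat is dead, which is what filterDeadComdatFunctions enforces. A minimal model of that rule, with string keys as hypothetical stand-ins for LLVM's Comdat objects:

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::string> ComdatOf = {
      {"f", "c1"}, {"g", "c1"}, {"h", "c2"}};
  std::map<std::string, std::set<std::string>> Members = {
      {"c1", {"f", "g"}}, {"c2", {"h"}}};
  std::set<std::string> Dead = {"f", "h"}; // trivially dead after inlining

  std::vector<std::string> Deletable;
  for (const std::string &F : Dead) {
    const std::set<std::string> &Group = Members[ComdatOf[F]];
    // Keep F only if every comdat sibling is also dead.
    if (std::all_of(Group.begin(), Group.end(),
                    [&](const std::string &M) { return Dead.count(M) != 0; }))
      Deletable.push_back(F);
  }
  for (const std::string &F : Deletable)
    std::cout << F << " can be deleted\n"; // prints only: h can be deleted
}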
- DeadF->getBasicBlockList().clear(); - M.getFunctionList().remove(DeadF); + M.getFunctionList().erase(DeadF); ++NumDeleted; } diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f289e3ecc979..68f33410c602 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -153,14 +154,6 @@ static constexpr auto TAG = "[" DEBUG_TYPE "]"; namespace { -enum class AddressSpace : unsigned { - Generic = 0, - Global = 1, - Shared = 3, - Constant = 4, - Local = 5, -}; - struct AAHeapToShared; struct AAICVTracker; @@ -170,7 +163,7 @@ struct AAICVTracker; struct OMPInformationCache : public InformationCache { OMPInformationCache(Module &M, AnalysisGetter &AG, BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, - SmallPtrSetImpl<Kernel> &Kernels) + KernelSet &Kernels) : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), Kernels(Kernels) { @@ -424,6 +417,12 @@ struct OMPInformationCache : public InformationCache { recollectUsesForFunction(static_cast<RuntimeFunction>(Idx)); } + // Helper function to inherit the calling convention of the function callee. + void setCallingConvention(FunctionCallee Callee, CallInst *CI) { + if (Function *Fn = dyn_cast<Function>(Callee.getCallee())) + CI->setCallingConv(Fn->getCallingConv()); + } + /// Helper to initialize all runtime function information for those defined /// in OpenMPKinds.def. void initializeRuntimeFunctions() { @@ -485,7 +484,7 @@ struct OMPInformationCache : public InformationCache { } /// Collection of known kernels (\see Kernel) in the module. - SmallPtrSetImpl<Kernel> &Kernels; + KernelSet &Kernels; /// Collection of known OpenMP runtime functions.. DenseSet<const Function *> RTLFunctions; @@ -1013,7 +1012,8 @@ private: // into a single parallel region is contained in a single basic block // without any other instructions. We use the OpenMPIRBuilder to outline // that block and call the resulting function via __kmpc_fork_call. - auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { + auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs, + BasicBlock *BB) { // TODO: Change the interface to allow single CIs expanded, e.g, to // include an outer loop. assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); @@ -1075,8 +1075,7 @@ private: BranchInst::Create(AfterBB, AfterIP.getBlock()); // Perform the actual outlining. - OMPInfoCache.OMPBuilder.finalize(OriginalFn, - /* AllowExtractorSinking */ true); + OMPInfoCache.OMPBuilder.finalize(OriginalFn); Function *OutlinedFn = MergableCIs.front()->getCaller(); @@ -1538,6 +1537,7 @@ private: CallInst *IssueCallsite = CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); + OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite); RuntimeCall.eraseFromParent(); // Add "wait" runtime call declaration: @@ -1550,7 +1550,9 @@ private: OffloadArray::DeviceIDArgNum), // device_id. Handle // handle to wait on. 
}; - CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); + CallInst *WaitCallsite = CallInst::Create( + WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); + OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite); return true; } @@ -1597,8 +1599,10 @@ private: &F.getEntryBlock(), F.getEntryBlock().begin())); // Create a fallback location if non was found. // TODO: Use the debug locations of the calls instead. - Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); - Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); + uint32_t SrcLocStrSize; + Constant *Loc = + OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize); } return Ident; } @@ -2171,7 +2175,7 @@ struct AAICVTrackerFunction : public AAICVTracker { }; auto CallCheck = [&](Instruction &I) { - Optional<Value *> ReplVal = getValueForCall(A, &I, ICV); + Optional<Value *> ReplVal = getValueForCall(A, I, ICV); if (ReplVal.hasValue() && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) HasChanged = ChangeStatus::CHANGED; @@ -2197,12 +2201,12 @@ struct AAICVTrackerFunction : public AAICVTracker { return HasChanged; } - /// Hepler to check if \p I is a call and get the value for it if it is + /// Helper to check if \p I is a call and get the value for it if it is /// unique. - Optional<Value *> getValueForCall(Attributor &A, const Instruction *I, + Optional<Value *> getValueForCall(Attributor &A, const Instruction &I, InternalControlVar &ICV) const { - const auto *CB = dyn_cast<CallBase>(I); + const auto *CB = dyn_cast<CallBase>(&I); if (!CB || CB->hasFnAttr("no_openmp") || CB->hasFnAttr("no_openmp_routines")) return None; @@ -2218,8 +2222,8 @@ struct AAICVTrackerFunction : public AAICVTracker { if (CalledFunction == GetterRFI.Declaration) return None; if (CalledFunction == SetterRFI.Declaration) { - if (ICVReplacementValuesMap[ICV].count(I)) - return ICVReplacementValuesMap[ICV].lookup(I); + if (ICVReplacementValuesMap[ICV].count(&I)) + return ICVReplacementValuesMap[ICV].lookup(&I); return nullptr; } @@ -2231,8 +2235,11 @@ struct AAICVTrackerFunction : public AAICVTracker { const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); - if (ICVTrackingAA.isAssumedTracked()) - return ICVTrackingAA.getUniqueReplacementValue(ICV); + if (ICVTrackingAA.isAssumedTracked()) { + Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); + if (!URV || (*URV && AA::isValidAtPosition(**URV, I, OMPInfoCache))) + return URV; + } // If we don't know, assume it changes. return nullptr; @@ -2284,7 +2291,7 @@ struct AAICVTrackerFunction : public AAICVTracker { break; } - Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV); + Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV); if (!NewReplVal.hasValue()) continue; @@ -2548,7 +2555,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { } /// Set of basic blocks that are executed by a single thread. - DenseSet<const BasicBlock *> SingleThreadedBBs; + SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs; /// Total number of basic blocks in this function. 
long unsigned NumBBs; @@ -2572,7 +2579,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { if (!A.checkForAllCallSites(PredForCallSite, *this, /* RequiresAllCallSites */ true, AllCallSitesKnown)) - SingleThreadedBBs.erase(&F->getEntryBlock()); + SingleThreadedBBs.remove(&F->getEntryBlock()); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; @@ -2637,7 +2644,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { for (auto *BB : RPOT) { if (!MergePredecessorStates(BB)) - SingleThreadedBBs.erase(BB); + SingleThreadedBBs.remove(BB); } return (NumSingleThreadedBBs == SingleThreadedBBs.size()) @@ -2759,7 +2766,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { if (FreeCalls.size() != 1) continue; - ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0)); + auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0)); LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB << " with " << AllocSize->getZExtValue() @@ -2772,7 +2779,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); auto *SharedMem = new GlobalVariable( *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, - UndefValue::get(Int8ArrTy), CB->getName(), nullptr, + UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr, GlobalValue::NotThreadLocal, static_cast<unsigned>(AddressSpace::Shared)); auto *NewBuffer = @@ -2786,7 +2793,10 @@ struct AAHeapToSharedFunction : public AAHeapToShared { }; A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark); - SharedMem->setAlignment(MaybeAlign(32)); + MaybeAlign Alignment = CB->getRetAlign(); + assert(Alignment && + "HeapToShared on allocation without alignment attribute"); + SharedMem->setAlignment(MaybeAlign(Alignment)); A.changeValueAfterManifest(*CB, *NewBuffer); A.deleteAfterManifest(*CB); @@ -2813,7 +2823,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { if (CallBase *CB = dyn_cast<CallBase>(U)) if (!isa<ConstantInt>(CB->getArgOperand(0)) || !ED.isExecutedByInitialThreadOnly(*CB)) - MallocCalls.erase(CB); + MallocCalls.remove(CB); } findPotentialRemovedFreeCalls(A); @@ -2825,7 +2835,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { } /// Collection of all malloc calls in a function. - SmallPtrSet<CallBase *, 4> MallocCalls; + SmallSetVector<CallBase *, 4> MallocCalls; /// Collection of potentially removed free calls in a function. 
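Several hunks in this file swap DenseSet and SmallPtrSet for SmallSetVector (SingleThreadedBBs, MallocCalls). The point of a set-vector is that iteration follows insertion order, so any transformation driven by walking the set is deterministic from run to run. A rough stand-in for the container's behavior using the standard library (simplified; the real SmallSetVector also has small-size inline storage):

#include <algorithm>
#include <iostream>
#include <set>
#include <vector>

template <typename T> class SetVectorLike {
  std::vector<T> Order; // iteration order == insertion order
  std::set<T> Seen;     // fast membership test

public:
  bool insert(const T &V) {
    if (!Seen.insert(V).second)
      return false; // already present, order unchanged
    Order.push_back(V);
    return true;
  }
  bool remove(const T &V) {
    if (!Seen.erase(V))
      return false;
    Order.erase(std::find(Order.begin(), Order.end(), V));
    return true;
  }
  auto begin() const { return Order.begin(); }
  auto end() const { return Order.end(); }
};

int main() {
  SetVectorLike<int> SV;
  for (int V : {30, 10, 20, 10})
    SV.insert(V);
  SV.remove(10);
  for (int V : SV)
    std::cout << V << ' '; // always prints: 30 20
  std::cout << '\n';
}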
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls; }; @@ -2962,7 +2972,7 @@ struct AAKernelInfoFunction : AAKernelInfo { A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); UsedAssumedInformation = !isAtFixpoint(); auto *FalseVal = - ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0); + ConstantInt::getBool(IRP.getAnchorValue().getContext(), false); return FalseVal; }; @@ -3225,8 +3235,11 @@ struct AAKernelInfoFunction : AAKernelInfo { OpenMPIRBuilder::LocationDescription Loc( InsertPointTy(ParentBB, ParentBB->end()), DL); OMPInfoCache.OMPBuilder.updateToLocation(Loc); - auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc); - Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr); + uint32_t SrcLocStrSize; + auto *SrcLocStr = + OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = + OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL); // Add check for Tid in RegionCheckTidBB @@ -3237,8 +3250,10 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee HardwareTidFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_hardware_thread_id_in_block); - Value *Tid = + CallInst *Tid = OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {}); + Tid->setDebugLoc(DL); + OMPInfoCache.setCallingConvention(HardwareTidFn, Tid); Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid); OMPInfoCache.OMPBuilder.Builder .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB) @@ -3251,14 +3266,18 @@ struct AAKernelInfoFunction : AAKernelInfo { M, OMPRTL___kmpc_barrier_simple_spmd); OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy( RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt())); - OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}) - ->setDebugLoc(DL); + CallInst *Barrier = + OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid}); + Barrier->setDebugLoc(DL); + OMPInfoCache.setCallingConvention(BarrierFn, Barrier); // Second barrier ensures workers have read broadcast values. - if (HasBroadcastValues) - CallInst::Create(BarrierFn, {Ident, Tid}, "", - RegionBarrierBB->getTerminator()) - ->setDebugLoc(DL); + if (HasBroadcastValues) { + CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "", + RegionBarrierBB->getTerminator()); + Barrier->setDebugLoc(DL); + OMPInfoCache.setCallingConvention(BarrierFn, Barrier); + } }; auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; @@ -3352,17 +3371,17 @@ struct AAKernelInfoFunction : AAKernelInfo { OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), - *ConstantInt::getBool(Ctx, 0)); + *ConstantInt::getBool(Ctx, false)); A.changeUseAfterManifest( KernelDeinitCB->getArgOperandUse(DeinitModeArgNo), *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo), - *ConstantInt::getBool(Ctx, 0)); + *ConstantInt::getBool(Ctx, false)); A.changeUseAfterManifest( KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo), - *ConstantInt::getBool(Ctx, 0)); + *ConstantInt::getBool(Ctx, false)); ++NumOpenMPTargetRegionKernelsSPMD; @@ -3403,7 +3422,7 @@ struct AAKernelInfoFunction : AAKernelInfo { // If not SPMD mode, indicate we use a custom state machine now. 
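The guarded-region rewrite above follows a classic SPMD pattern: one thread executes the sequential region, and barriers fence it so every thread observes its side effects (with a second barrier when values are broadcast). A conceptual C++20 model of that shape, not the generated IR or the OpenMP runtime calls:

#include <barrier>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  constexpr int NumThreads = 4;
  int Broadcast = 0;
  std::barrier Sync(NumThreads);

  auto Worker = [&](int Tid) {
    if (Tid == 0)           // "region check tid": only thread 0 runs the region
      Broadcast = 42;       // side effect of the guarded region
    Sync.arrive_and_wait(); // barrier: now all threads may read the result
    if (Tid == 1)
      std::cout << "thread 1 sees " << Broadcast << "\n"; // prints 42
  };

  std::vector<std::thread> Threads;
  for (int T = 0; T < NumThreads; ++T)
    Threads.emplace_back(Worker, T);
  for (std::thread &T : Threads)
    T.join();
}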
auto &Ctx = getAnchorValue().getContext(); - auto *FalseVal = ConstantInt::getBool(Ctx, 0); + auto *FalseVal = ConstantInt::getBool(Ctx, false); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); @@ -3528,10 +3547,12 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee WarpSizeFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_warp_size); - Instruction *BlockHwSize = + CallInst *BlockHwSize = CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize); BlockHwSize->setDebugLoc(DLoc); - Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize); WarpSize->setDebugLoc(DLoc); Instruction *BlockSize = BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); @@ -3571,8 +3592,10 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee BarrierFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_barrier_simple_generic); - CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) - ->setDebugLoc(DLoc); + CallInst *Barrier = + CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB); + OMPInfoCache.setCallingConvention(BarrierFn, Barrier); + Barrier->setDebugLoc(DLoc); if (WorkFnAI->getType()->getPointerAddressSpace() != (unsigned int)AddressSpace::Generic) { @@ -3588,8 +3611,9 @@ struct AAKernelInfoFunction : AAKernelInfo { FunctionCallee KernelParallelFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_kernel_parallel); - Instruction *IsActiveWorker = CallInst::Create( + CallInst *IsActiveWorker = CallInst::Create( KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB); + OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker); IsActiveWorker->setDebugLoc(DLoc); Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn", StateMachineBeginBB); @@ -3669,10 +3693,13 @@ struct AAKernelInfoFunction : AAKernelInfo { StateMachineIfCascadeCurrentBB) ->setDebugLoc(DLoc); - CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___kmpc_kernel_end_parallel), - {}, "", StateMachineEndParallelBB) - ->setDebugLoc(DLoc); + FunctionCallee EndParallelFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_kernel_end_parallel); + CallInst *EndParallel = + CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB); + OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel); + EndParallel->setDebugLoc(DLoc); BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB) ->setDebugLoc(DLoc); @@ -4508,6 +4535,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { bool UsedAssumedInformation = false; A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr, UsedAssumedInformation); + } else if (auto *SI = dyn_cast<StoreInst>(&I)) { + A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI)); } } } diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 2d717475ce7f..5f2223e4047e 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -169,8 +169,7 @@ struct FunctionOutliningInfo { }; struct FunctionOutliningMultiRegionInfo { - FunctionOutliningMultiRegionInfo() - : ORI() {} + FunctionOutliningMultiRegionInfo() {} // Container for 
outline regions struct OutlineRegionInfo { @@ -971,6 +970,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( }; for (User *User : Users) { + // Don't bother with BlockAddress used by CallBr for asm goto. + if (isa<BlockAddress>(User)) + continue; CallBase *CB = getSupportedCallBase(User); Function *Caller = CB->getCaller(); if (CurrentCaller != Caller) { @@ -1414,6 +1416,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { bool AnyInline = false; for (User *User : Users) { + // Don't bother with BlockAddress used by CallBr for asm goto. + if (isa<BlockAddress>(User)) + continue; + CallBase *CB = getSupportedCallBase(User); if (isLimitReached()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index eb1b8a29cfc5..0598f751febe 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -519,13 +519,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { unsigned NextTmpIdx = 0; FAddend TmpResult[3]; - // Points to the constant addend of the resulting simplified expression. - // If the resulting expr has constant-addend, this constant-addend is - // desirable to reside at the top of the resulting expression tree. Placing - // constant close to supper-expr(s) will potentially reveal some optimization - // opportunities in super-expr(s). - const FAddend *ConstAdd = nullptr; - // Simplified addends are placed <SimpVect>. AddendVect SimpVect; @@ -541,6 +534,14 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { } Value *Val = ThisAddend->getSymVal(); + + // If the resulting expr has constant-addend, this constant-addend is + // desirable to reside at the top of the resulting expression tree. Placing + // constant close to super-expr(s) will potentially reveal some + // optimization opportunities in super-expr(s). Here we do not implement + // this logic intentionally and rely on SimplifyAssociativeOrCommutative + // call later. + unsigned StartIdx = SimpVect.size(); SimpVect.push_back(ThisAddend); @@ -569,14 +570,8 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { // Pop all addends being folded and push the resulting folded addend. SimpVect.resize(StartIdx); - if (Val) { - if (!R.isZero()) { - SimpVect.push_back(&R); - } - } else { - // Don't push constant addend at this time. It will be the last element - // of <SimpVect>. 
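The bookkeeping being removed here deferred the constant addend so it would land at the top of the rebuilt expression tree; the added comment notes this is now left to SimplifyAssociativeOrCommutative. The underlying identity is plain reassociation, which is only licensed under fast-math; a numeric illustration with values small enough that every sum is exact:

#include <iostream>

int main() {
  double A = 3.0, B = 5.0;
  double Buried = (A + 1.0) + (B + 2.0); // constants buried in the tree
  double Hoisted = (A + B) + 3.0;        // canonical form: constant on top
  std::cout << Buried << " == " << Hoisted << "\n"; // prints: 11 == 11
}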
- ConstAdd = &R; + if (!R.isZero()) { + SimpVect.push_back(&R); } } } @@ -584,9 +579,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) && "out-of-bound access"); - if (ConstAdd) - SimpVect.push_back(ConstAdd); - Value *Result; if (!SimpVect.empty()) Result = createNaryFAdd(SimpVect, InstrQuota); @@ -1296,6 +1288,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // (A*B)+(A*C) -> A*(B+C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); @@ -1498,15 +1493,18 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I, return Lerp; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (!Op0->hasOneUse() || !Op1->hasOneUse()) + return nullptr; + Value *X, *Y, *Z; bool IsFMul; - if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) && - match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) || - (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) && - match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z)))))) + if ((match(Op0, m_FMul(m_Value(X), m_Value(Z))) && + match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z)))) || + (match(Op0, m_FMul(m_Value(Z), m_Value(X))) && + match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z))))) IsFMul = true; - else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) && - match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z))))) + else if (match(Op0, m_FDiv(m_Value(X), m_Value(Z))) && + match(Op1, m_FDiv(m_Value(Y), m_Specific(Z)))) IsFMul = false; else return nullptr; @@ -1541,6 +1539,9 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) return FoldedFAdd; @@ -1654,6 +1655,14 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { {X->getType()}, {NewStartC, X}, &I)); } + // (X * MulC) + X --> X * (MulC + 1.0) + Constant *MulC; + if (match(&I, m_c_FAdd(m_FMul(m_Value(X), m_ImmConstant(MulC)), + m_Deferred(X)))) { + MulC = ConstantExpr::getFAdd(MulC, ConstantFP::get(I.getType(), 1.0)); + return BinaryOperator::CreateFMulFMF(X, MulC, &I); + } + if (Value *V = FAddCombine(Builder).simplify(&I)) return replaceInstUsesWith(I, V); } @@ -1748,6 +1757,9 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // If this is a 'B = x-(-A)', change to B = x+A. @@ -2310,6 +2322,9 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // Subtraction from -0.0 is the canonical form of fneg. // fsub -0.0, X ==> fneg X // fsub nsz 0.0, X ==> fneg nsz X diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index de1034c910d5..6bbb0251f2bc 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1727,25 +1727,37 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, (Opcode == Instruction::And) ? 
Instruction::Or : Instruction::And; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - Value *A, *B, *C, *X, *Y; + Value *A, *B, *C, *X, *Y, *Dummy; + + // Match following expressions: + // (~(A | B) & C) + // (~(A & B) | C) + // Captures X = ~(A | B) or ~(A & B) + const auto matchNotOrAnd = + [Opcode, FlippedOpcode](Value *Op, auto m_A, auto m_B, auto m_C, + Value *&X, bool CountUses = false) -> bool { + if (CountUses && !Op->hasOneUse()) + return false; + + if (match(Op, m_c_BinOp(FlippedOpcode, + m_CombineAnd(m_Value(X), + m_Not(m_c_BinOp(Opcode, m_A, m_B))), + m_C))) + return !CountUses || X->hasOneUse(); + + return false; + }; // (~(A | B) & C) | ... --> ... // (~(A & B) | C) & ... --> ... // TODO: One use checks are conservative. We just need to check that a total // number of multiple used values does not exceed reduction // in operations. - if (match(Op0, - m_c_BinOp(FlippedOpcode, - m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A), - m_Value(B)))), - m_Value(C)))) { + if (matchNotOrAnd(Op0, m_Value(A), m_Value(B), m_Value(C), X)) { // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A) - if (match(Op1, - m_OneUse(m_c_BinOp(FlippedOpcode, - m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(A), - m_Specific(C)))), - m_Specific(B))))) { + if (matchNotOrAnd(Op1, m_Specific(A), m_Specific(C), m_Specific(B), Dummy, + true)) { Value *Xor = Builder.CreateXor(B, C); return (Opcode == Instruction::Or) ? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(A)) @@ -1754,11 +1766,8 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, // (~(A | B) & C) | (~(B | C) & A) --> (A ^ C) & ~B // (~(A & B) | C) & (~(B & C) | A) --> ~((A ^ C) & B) - if (match(Op1, - m_OneUse(m_c_BinOp(FlippedOpcode, - m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(B), - m_Specific(C)))), - m_Specific(A))))) { + if (matchNotOrAnd(Op1, m_Specific(B), m_Specific(C), m_Specific(A), Dummy, + true)) { Value *Xor = Builder.CreateXor(A, C); return (Opcode == Instruction::Or) ? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(B)) @@ -1863,6 +1872,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. 
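The two folds rewritten above through matchNotOrAnd are bitwise identities; the Or-rooted forms can be checked exhaustively (the And-rooted duals follow by De Morgan). A brute-force verification over all byte triples:

#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      for (unsigned C = 0; C < 256; ++C) {
        // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A
        uint8_t L1 = (~(A | B) & C) | (~(A | C) & B);
        uint8_t R1 = (B ^ C) & ~A;
        // (~(A | B) & C) | (~(B | C) & A) --> (A ^ C) & ~B
        uint8_t L2 = (~(A | B) & C) | (~(B | C) & A);
        uint8_t R2 = (A ^ C) & ~B;
        assert(L1 == R1 && L2 == R2);
      }
  std::cout << "both identities hold for all byte values\n";
}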
if (SimplifyDemandedInstructionBits(I)) @@ -2072,21 +2084,37 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); - // (A | B) & ((~A) ^ B) -> (A & B) - // (A | B) & (B ^ (~A)) -> (A & B) - // (B | A) & ((~A) ^ B) -> (A & B) - // (B | A) & (B ^ (~A)) -> (A & B) + // (A | B) & (~A ^ B) -> A & B + // (A | B) & (B ^ ~A) -> A & B + // (B | A) & (~A ^ B) -> A & B + // (B | A) & (B ^ ~A) -> A & B if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); - // ((~A) ^ B) & (A | B) -> (A & B) - // ((~A) ^ B) & (B | A) -> (A & B) - // (B ^ (~A)) & (A | B) -> (A & B) - // (B ^ (~A)) & (B | A) -> (A & B) + // (~A ^ B) & (A | B) -> A & B + // (~A ^ B) & (B | A) -> A & B + // (B ^ ~A) & (A | B) -> A & B + // (B ^ ~A) & (B | A) -> A & B if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); + + // (~A | B) & (A ^ B) -> ~A & B + // (~A | B) & (B ^ A) -> ~A & B + // (B | ~A) & (A ^ B) -> ~A & B + // (B | ~A) & (B ^ A) -> ~A & B + if (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_Xor(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); + + // (A ^ B) & (~A | B) -> ~A & B + // (B ^ A) & (~A | B) -> ~A & B + // (A ^ B) & (B | ~A) -> ~A & B + // (B ^ A) & (B | ~A) -> ~A & B + if (match(Op1, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op0, m_c_Xor(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); } { @@ -2640,6 +2668,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. 
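The newly added folds above are likewise pure boolean identities. An exhaustive check of (~A | B) & (A ^ B) --> ~A & B over all byte pairs (the commuted variants reduce to the same expression):

#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t L = (~A | B) & (A ^ B);
      uint8_t R = ~A & B;
      assert(L == R);
    }
  std::cout << "identity holds for all byte values\n";
}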
if (SimplifyDemandedInstructionBits(I)) @@ -3528,6 +3559,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *NewXor = foldXorToXor(I, Builder)) return NewXor; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 14427bd1f2f4..1fb46af46bee 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -352,9 +352,27 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { // * Dereferenceable address & few lanes -> scalarize speculative load/selects // * Adjacent vector addresses -> masked.load // * Narrow width by halfs excluding zero/undef lanes -// * Vector splat address w/known mask -> scalar load // * Vector incrementing address -> vector masked load Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); + if (!ConstMask) + return nullptr; + + // Vector splat address w/known mask -> scalar load + // Fold the gather to load the source vector first lane + // because it is reloading the same value each time + if (ConstMask->isAllOnesValue()) + if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) { + auto *VecTy = cast<VectorType>(II.getType()); + const Align Alignment = + cast<ConstantInt>(II.getArgOperand(1))->getAlignValue(); + LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr, + Alignment, "load.scalar"); + Value *Shuf = + Builder.CreateVectorSplat(VecTy->getElementCount(), L, "broadcast"); + return replaceInstUsesWith(II, cast<Instruction>(Shuf)); + } + return nullptr; } @@ -362,7 +380,6 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { // * Single constant active lane -> store // * Adjacent vector addresses -> masked.store // * Narrow store width by halfs excluding zero/undef lanes -// * Vector splat address w/known mask -> scalar store // * Vector incrementing address -> vector masked store Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); @@ -373,6 +390,34 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { if (ConstMask->isNullValue()) return eraseInstFromFunction(II); + // Vector splat address -> scalar store + if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) { + // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr + if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) { + Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); + StoreInst *S = + new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment); + S->copyMetadata(II); + return S; + } + // scatter(vector, splat(ptr), splat(true)) -> store extract(vector, + // lastlane), ptr + if (ConstMask->isAllOnesValue()) { + Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue(); + VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType()); + ElementCount VF = WideLoadTy->getElementCount(); + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *RunTimeVF = VF.isScalable() ? 
Builder.CreateVScale(EC) : EC; + Value *LastLane = Builder.CreateSub(RunTimeVF, Builder.getInt32(1)); + Value *Extract = + Builder.CreateExtractElement(II.getArgOperand(0), LastLane); + StoreInst *S = + new StoreInst(Extract, SplatPtr, /*IsVolatile=*/false, Alignment); + S->copyMetadata(II); + return S; + } + } if (isa<ScalableVectorType>(ConstMask->getType())) return nullptr; @@ -449,7 +494,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // ctlz/cttz i1 Op0 --> not Op0 if (match(Op1, m_Zero())) return BinaryOperator::CreateNot(Op0); - // If zero is undef, then the input can be assumed to be "true", so the + // If zero is poison, then the input can be assumed to be "true", so the // instruction simplifies to "false". assert(match(Op1, m_One()) && "Expected ctlz/cttz operand to be 0 or 1"); return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType())); @@ -474,7 +519,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { } // Zext doesn't change the number of trailing zeros, so narrow: - // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsUndef' parameter is 'true'. + // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsPoison' parameter is 'true'. if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && match(Op1, m_One())) { auto *Cttz = IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, X, IC.Builder.getTrue()); @@ -511,7 +556,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { } // If the input to cttz/ctlz is known to be non-zero, - // then change the 'ZeroIsUndef' parameter to 'true' + // then change the 'ZeroIsPoison' parameter to 'true' // because we know the zero behavior can't affect the result. if (!Known.One.isZero() || isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, @@ -1188,6 +1233,21 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Value *IIOperand = II->getArgOperand(0); Value *X = nullptr; + KnownBits Known = computeKnownBits(IIOperand, 0, II); + uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8); + uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8); + + // bswap(x) -> shift(x) if x has exactly one "active byte" + if (Known.getBitWidth() - LZ - TZ == 8) { + assert(LZ != TZ && "active byte cannot be in the middle"); + if (LZ > TZ) // -> shl(x) if the "active byte" is in the low part of x + return BinaryOperator::CreateNUWShl( + IIOperand, ConstantInt::get(IIOperand->getType(), LZ - TZ)); + // -> lshr(x) if the "active byte" is in the high part of x + return BinaryOperator::CreateExactLShr( + IIOperand, ConstantInt::get(IIOperand->getType(), TZ - LZ)); + } + // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { unsigned C = X->getType()->getScalarSizeInBits() - @@ -2460,7 +2520,7 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call, if (!Call.isByValArgument(ix)) return false; - Type *SrcElemTy = SrcTy->getElementType(); + Type *SrcElemTy = SrcTy->getNonOpaquePointerElementType(); Type *DstElemTy = Call.getParamByValType(ix); if (!SrcElemTy->isSized() || !DstElemTy->isSized()) return false; @@ -2571,57 +2631,36 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) { } void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { - unsigned NumArgs = Call.arg_size(); - ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); - ConstantInt *Op1C = - (NumArgs == 1) ? 
nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); - // Bail out if the allocation size is zero (or an invalid alignment of zero - // with aligned_alloc). - if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) - return; - - if (isMallocLikeFn(&Call, TLI) && Op0C) { - if (isOpNewLikeFn(&Call, TLI)) + // Note: We only handle cases which can't be driven from generic attributes + // here. So, for example, nonnull and noalias (which are common properties + // of some allocation functions) are expected to be handled via annotation + // of the respective allocator declaration with generic attributes. + + uint64_t Size; + ObjectSizeOpts Opts; + if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) { + // TODO: We really should just emit deref_or_null here and then + // let the generic inference code combine that with nonnull. + if (Call.hasRetAttr(Attribute::NonNull)) Call.addRetAttr(Attribute::getWithDereferenceableBytes( - Call.getContext(), Op0C->getZExtValue())); + Call.getContext(), Size)); else Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Op0C->getZExtValue())); - } else if (isAlignedAllocLikeFn(&Call, TLI)) { - if (Op1C) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Op1C->getZExtValue())); - // Add alignment attribute if alignment is a power of two constant. - if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment) && - isKnownNonZero(Call.getOperand(1), DL, 0, &AC, &Call, &DT)) { - uint64_t AlignmentVal = Op0C->getZExtValue(); - if (llvm::isPowerOf2_64(AlignmentVal)) { - Call.removeRetAttr(Attribute::Alignment); - Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(), - Align(AlignmentVal))); - } - } - } else if (isReallocLikeFn(&Call, TLI) && Op1C) { - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Op1C->getZExtValue())); - } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { - bool Overflow; - const APInt &N = Op0C->getValue(); - APInt Size = N.umul_ov(Op1C->getValue(), Overflow); - if (!Overflow) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Size.getZExtValue())); - } else if (isStrdupLikeFn(&Call, TLI)) { - uint64_t Len = GetStringLength(Call.getOperand(0)); - if (Len) { - // strdup - if (NumArgs == 1) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), Len)); - // strndup - else if (NumArgs == 2 && Op1C) - Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes( - Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); + Call.getContext(), Size)); + } + + // Add alignment attribute if alignment is a power of two constant. 
+ Value *Alignment = getAllocAlignment(&Call, TLI); + if (!Alignment) + return; + + ConstantInt *AlignOpC = dyn_cast<ConstantInt>(Alignment); + if (AlignOpC && AlignOpC->getValue().ult(llvm::Value::MaximumAlignment)) { + uint64_t AlignmentVal = AlignOpC->getZExtValue(); + if (llvm::isPowerOf2_64(AlignmentVal)) { + Call.removeRetAttr(Attribute::Alignment); + Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(), + Align(AlignmentVal))); } } } @@ -2744,9 +2783,9 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { PointerType *NewTy = cast<PointerType>(CI->getOperand(0)->getType()); if (!NewTy->isOpaque() && Call.isByValArgument(ix)) { Call.removeParamAttr(ix, Attribute::ByVal); - Call.addParamAttr( - ix, Attribute::getWithByValType( - Call.getContext(), NewTy->getElementType())); + Call.addParamAttr(ix, Attribute::getWithByValType( + Call.getContext(), + NewTy->getNonOpaquePointerElementType())); } Changed = true; } @@ -2782,7 +2821,8 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy)); } - if (isAllocLikeFn(&Call, &TLI)) + if (isAllocationFn(&Call, &TLI) && + isAllocRemovable(&cast<CallBase>(Call), &TLI)) return visitAllocSite(Call); // Handle intrinsics which can be used in both call and invoke context. @@ -2934,7 +2974,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { } if (!CallerPAL.isEmpty() && !Caller->use_empty()) { - AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs()); if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) return false; // Attribute not compatible with transformed value. } @@ -2980,7 +3020,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) return false; // Cannot transform this parameter value. - if (AttrBuilder(CallerPAL.getParamAttrs(i)) + if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i)) .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) return false; // Attribute not compatible with transformed value. @@ -2994,12 +3034,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // sized type and the sized type has to have the same size as the old type. if (ParamTy != ActTy && CallerPAL.hasParamAttr(i, Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); - if (!ParamPTy || !ParamPTy->getElementType()->isSized()) + if (!ParamPTy || !ParamPTy->getPointerElementType()->isSized()) return false; Type *CurElTy = Call.getParamByValType(i); if (DL.getTypeAllocSize(CurElTy) != - DL.getTypeAllocSize(ParamPTy->getElementType())) + DL.getTypeAllocSize(ParamPTy->getPointerElementType())) return false; } } @@ -3012,17 +3052,14 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // If the callee is just a declaration, don't change the varargsness of the // call. We don't want to introduce a varargs call where one doesn't // already exist. - PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType()); - if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) + if (FT->isVarArg() != Call.getFunctionType()->isVarArg()) return false; // If both the callee and the cast type are varargs, we still have to make // sure the number of fixed parameters are the same or we have the same // ABI issues as if we introduce a varargs call. 
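Stepping back to the bswap combine added earlier in this file's hunks: a value with a single known "active byte" is mapped by bswap to a shift, with the direction and amount given by the byte-aligned leading and trailing zero counts. A quick stand-alone check of that identity on sample constants (the values are illustrative):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t X = 0x0000AB00u; // LZ = 16, TZ = 8: one active byte, low half
  assert(__builtin_bswap32(X) == (X << (16 - 8))); // LZ > TZ: shl by LZ - TZ

  uint32_t Y = 0x00CD0000u; // LZ = 8, TZ = 16: one active byte, high half
  assert(__builtin_bswap32(Y) == (Y >> (16 - 8))); // TZ > LZ: lshr by TZ - LZ

  std::puts("bswap-to-shift identity holds for the sampled values");
}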
- if (FT->isVarArg() && - cast<FunctionType>(APTy->getElementType())->isVarArg() && - FT->getNumParams() != - cast<FunctionType>(APTy->getElementType())->getNumParams()) + if (FT->isVarArg() && Call.getFunctionType()->isVarArg() && + FT->getNumParams() != Call.getFunctionType()->getNumParams()) return false; } @@ -3045,7 +3082,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { ArgAttrs.reserve(NumActualArgs); // Get any return attributes. - AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs()); // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. @@ -3063,7 +3100,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Add any parameter attributes. if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) { - AttrBuilder AB(CallerPAL.getParamAttrs(i)); + AttrBuilder AB(FT->getContext(), CallerPAL.getParamAttrs(i)); AB.addByValAttr(NewArg->getType()->getPointerElementType()); ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); } else diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 8df4a4529f47..f11ba8772f3c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -85,13 +85,16 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { PointerType *PTy = cast<PointerType>(CI.getType()); + // Opaque pointers don't have an element type we could replace with. + if (PTy->isOpaque()) + return nullptr; IRBuilderBase::InsertPointGuard Guard(Builder); Builder.SetInsertPoint(&AI); // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); - Type *CastElTy = PTy->getElementType(); + Type *CastElTy = PTy->getNonOpaquePointerElementType(); if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; // This optimisation does not work for cases where the cast type @@ -2649,8 +2652,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder, if (SrcPTy->isOpaque() || DstPTy->isOpaque()) return nullptr; - Type *DstElTy = DstPTy->getElementType(); - Type *SrcElTy = SrcPTy->getElementType(); + Type *DstElTy = DstPTy->getNonOpaquePointerElementType(); + Type *SrcElTy = SrcPTy->getNonOpaquePointerElementType(); // When the type pointed to is not sized the cast cannot be // turned into a gep. @@ -2669,8 +2672,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder, // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0)); - GetElementPtrInst *GEP = - GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + SrcPTy->getNonOpaquePointerElementType(), Src, Idxs); // If the source pointer is dereferenceable, then assume it points to an // allocated object and apply "inbounds" to the GEP. 
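The convertBitCastToGEP path above replaces a bitcast with an all-zero-index GEP when the destination element type nests at offset zero inside the source element type. The address identity it relies on, in C++ terms (the struct layout is illustrative):

#include <iostream>

struct Inner { int X; };
struct Outer { Inner I; }; // an int nests at offset 0 through two levels

int main() {
  Outer O{{7}};
  // (int *)&O corresponds to "gep %Outer, ptr, 0, 0, 0" in IR terms.
  int *ViaCast = reinterpret_cast<int *>(&O);
  int *ViaGEP = &O.I.X;
  std::cout << (ViaCast == ViaGEP) << " " << *ViaCast << "\n"; // prints: 1 7
}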
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index ed53b88aed61..fd58a44504b3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -503,7 +503,7 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC, /// Returns true if we can rewrite Start as a GEP with pointer Base /// and some integer offset. The nodes that need to be re-written /// for this transformation will be added to Explored. -static bool canRewriteGEPAsOffset(Value *Start, Value *Base, +static bool canRewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base, const DataLayout &DL, SetVector<Value *> &Explored) { SmallVector<Value *, 16> WorkList(1, Start); @@ -551,7 +551,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base, // the original pointer type. We could handle more cases in the // future. if (GEP->getNumIndices() != 1 || !GEP->isInBounds() || - GEP->getType() != Start->getType()) + GEP->getSourceElementType() != ElemTy) return false; if (!Explored.contains(GEP->getOperand(0))) @@ -627,7 +627,7 @@ static void setInsertionPoint(IRBuilder<> &Builder, Value *V, /// Returns a re-written value of Start as an indexed GEP using Base as a /// pointer. -static Value *rewriteGEPAsOffset(Value *Start, Value *Base, +static Value *rewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base, const DataLayout &DL, SetVector<Value *> &Explored) { // Perform all the substitutions. This is a bit tricky because we can @@ -714,6 +714,8 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, } } + PointerType *PtrTy = + ElemTy->getPointerTo(Start->getType()->getPointerAddressSpace()); for (Value *Val : Explored) { if (Val == Base) continue; @@ -722,22 +724,14 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, // a GEP or a GEP + ptrtoint. setInsertionPoint(Builder, Val, false); - // If required, create an inttoptr instruction for Base. - Value *NewBase = Base; - if (!Base->getType()->isPointerTy()) - NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(), - Start->getName() + "to.ptr"); - - Value *GEP = Builder.CreateInBoundsGEP( - Start->getType()->getPointerElementType(), NewBase, - makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); - - if (!Val->getType()->isPointerTy()) { - Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(), - Val->getName() + ".conv"); - GEP = Cast; - } - Val->replaceAllUsesWith(GEP); + // Cast base to the expected type. + Value *NewVal = Builder.CreateBitOrPointerCast( + Base, PtrTy, Start->getName() + "to.ptr"); + NewVal = Builder.CreateInBoundsGEP( + ElemTy, NewVal, makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); + NewVal = Builder.CreateBitOrPointerCast( + NewVal, Val->getType(), Val->getName() + ".conv"); + Val->replaceAllUsesWith(NewVal); } return NewInsts[Start]; @@ -747,7 +741,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, /// the input Value as a constant indexed GEP. Returns a pair containing /// the GEPs Pointer and Index. 
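// A minimal standalone sketch (not from this patch) of the decomposition
// getAsConstantIndexedAddress performs below: walking a chain of
// constant-index GEPs accumulates a single (Base, Index) pair.
#include <cassert>
void accumulateConstantIndices() {
  int Buf[16] = {};
  int *P = Buf + 3;     // gep Buf, 3
  int *Q = P + 5;       // gep (gep Buf, 3), 5
  assert(Q == Buf + 8); // decomposes to Base = Buf, Index = 8
}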
static std::pair<Value *, Value *> -getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { +getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) { Type *IndexType = IntegerType::get(V->getContext(), DL.getIndexTypeSizeInBits(V->getType())); @@ -759,7 +753,7 @@ getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { if (!GEP->isInBounds()) break; if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 && - GEP->getType() == V->getType()) { + GEP->getSourceElementType() == ElemTy) { V = GEP->getOperand(0); Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1)); Index = ConstantExpr::getAdd( @@ -798,17 +792,14 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, if (!GEPLHS->hasAllConstantIndices()) return nullptr; - // Make sure the pointers have the same type. - if (GEPLHS->getType() != RHS->getType()) - return nullptr; - + Type *ElemTy = GEPLHS->getSourceElementType(); Value *PtrBase, *Index; - std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL); + std::tie(PtrBase, Index) = getAsConstantIndexedAddress(ElemTy, GEPLHS, DL); // The set of nodes that will take part in this transformation. SetVector<Value *> Nodes; - if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes)) + if (!canRewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes)) return nullptr; // We know we can re-write this as @@ -817,7 +808,7 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, // can't have overflow on either side. We can therefore re-write // this as: // OFFSET1 cmp OFFSET2 - Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes); + Value *NewRHS = rewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes); // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written // GEP having PtrBase as the pointer base, and has returned in NewRHS the @@ -894,9 +885,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If the base pointers are different, but the indices are the same, just // compare the base pointer. if (PtrBase != GEPRHS->getOperand(0)) { - bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands(); - IndicesTheSame &= GEPLHS->getOperand(0)->getType() == - GEPRHS->getOperand(0)->getType(); + bool IndicesTheSame = + GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && + GEPLHS->getType() == GEPRHS->getType() && + GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType(); if (IndicesTheSame) for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { @@ -1271,8 +1263,8 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // This is only really a signed overflow check if the inputs have been // sign-extended; check for that condition. For example, if CI2 is 2^31 and // the operands of the add are 64 bits wide, we need at least 33 sign bits. 
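// A minimal standalone sketch (not from this patch) of the "significant
// bits" notion used below: the bits needed to represent a value in two's
// complement, i.e. the bit width minus redundant copies of the sign bit.
#include <cassert>
#include <cstdint>
unsigned significantBits(int64_t X) {
  unsigned Bits = 64;
  // Drop high bits while they merely repeat the sign bit (arithmetic
  // right shift on negative values assumed, as on mainstream compilers).
  while (Bits > 1 && (X >> (Bits - 2)) == (X >> (Bits - 1)))
    --Bits;
  return Bits;
}
void checkClaimFromCommentAbove() {
  assert(significantBits(int64_t(1) << 31) == 33); // 2^31 needs 33 bits
  assert(significantBits(-1) == 1);
}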
- if (IC.ComputeMinSignedBits(A, 0, &I) > NewWidth || - IC.ComputeMinSignedBits(B, 0, &I) > NewWidth) + if (IC.ComputeMaxSignificantBits(A, 0, &I) > NewWidth || + IC.ComputeMaxSignificantBits(B, 0, &I) > NewWidth) return nullptr; // In order to replace the original add with a narrower @@ -2221,7 +2213,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0 Value *X = Shr->getOperand(0); CmpInst::Predicate Pred = Cmp.getPredicate(); - if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && C.isZero()) + if (Cmp.isEquality() && Shr->isExact() && C.isZero()) return new ICmpInst(Pred, X, Cmp.getOperand(1)); const APInt *ShiftVal; @@ -2247,9 +2239,10 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, // those conditions rather than checking them. This is difficult because of // undef/poison (PR34838). if (IsAShr) { - if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) { - // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC) - // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC) + if (IsExact || Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) { + // When ShAmtC can be shifted losslessly: + // icmp PRED (ashr exact X, ShAmtC), C --> icmp PRED X, (C << ShAmtC) + // icmp slt/ult (ashr X, ShAmtC), C --> icmp slt/ult X, (C << ShAmtC) APInt ShiftedC = C.shl(ShAmtVal); if (ShiftedC.ashr(ShAmtVal) == C) return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); @@ -2261,6 +2254,12 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, (ShiftedC + 1).ashr(ShAmtVal) == (C + 1)) return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); } + if (Pred == CmpInst::ICMP_UGT) { + // icmp ugt (ashr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1 + APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1; + if ((ShiftedC + 1).ashr(ShAmtVal) == (C + 1)) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); + } // If the compare constant has significant bits above the lowest sign-bit, // then convert an unsigned cmp to a test of the sign-bit: @@ -3957,6 +3956,33 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); + { + // Similar to above: an unsigned overflow comparison may use offset + mask: + // ((Op1 + C) & C) u< Op1 --> Op1 != 0 + // ((Op1 + C) & C) u>= Op1 --> Op1 == 0 + // Op0 u> ((Op0 + C) & C) --> Op0 != 0 + // Op0 u<= ((Op0 + C) & C) --> Op0 == 0 + BinaryOperator *BO; + const APInt *C; + if ((Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE) && + match(Op0, m_And(m_BinOp(BO), m_LowBitMask(C))) && + match(BO, m_Add(m_Specific(Op1), m_SpecificIntAllowUndef(*C)))) { + CmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Constant *Zero = ConstantInt::getNullValue(Op1->getType()); + return new ICmpInst(NewPred, Op1, Zero); + } + + if ((Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE) && + match(Op1, m_And(m_BinOp(BO), m_LowBitMask(C))) && + match(BO, m_Add(m_Specific(Op0), m_SpecificIntAllowUndef(*C)))) { + CmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_UGT ? 
ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + Constant *Zero = ConstantInt::getNullValue(Op1->getType()); + return new ICmpInst(NewPred, Op0, Zero); + } + } + bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; if (BO0 && isa<OverflowingBinaryOperator>(BO0)) NoOp0WrapProblem = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 39b55b028110..7743b4c41555 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -148,6 +148,8 @@ public: Instruction *SliceUpIllegalIntegerPHI(PHINode &PN); Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); + Instruction *visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src); + Instruction *visitGEPOfBitcast(BitCastInst *BCI, GetElementPtrInst &GEP); Instruction *visitAllocaInst(AllocaInst &AI); Instruction *visitAllocSite(Instruction &FI); Instruction *visitFree(CallInst &FI); @@ -195,8 +197,6 @@ private: bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; - Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset, - SmallVectorImpl<Value *> &NewIndices); /// Classify whether a cast is worth optimizing. /// @@ -607,6 +607,16 @@ public: /// only possible if all operands to the PHI are constants). Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN); + /// For a binary operator with 2 phi operands, try to hoist the binary + /// operation before the phi. This can result in fewer instructions in + /// patterns where at least one set of phi operands simplifies. + /// Example: + /// BB3: binop (phi [X, BB1], [C1, BB2]), (phi [Y, BB1], [C2, BB2]) + /// --> + /// BB1: BO = binop X, Y + /// BB3: phi [BO, BB1], [(binop C1, C2), BB2] + Instruction *foldBinopWithPhiOperands(BinaryOperator &BO); + /// Given an instruction with a select as one operand and a constant as the /// other operand, try to fold the binary operator into the select arguments. 
/// This also works for Cast instructions, which obviously do not have a diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0dbfdba353c4..756792918dba 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -301,16 +301,17 @@ void PointerReplacer::replace(Instruction *I) { assert(V && "Operand not replaced"); SmallVector<Value *, 8> Indices; Indices.append(GEP->idx_begin(), GEP->idx_end()); - auto *NewI = GetElementPtrInst::Create( - V->getType()->getPointerElementType(), V, Indices); + auto *NewI = + GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices); IC.InsertNewInstWith(NewI, *GEP); NewI->takeName(GEP); WorkMap[GEP] = NewI; } else if (auto *BC = dyn_cast<BitCastInst>(I)) { auto *V = getReplacement(BC->getOperand(0)); assert(V && "Operand not replaced"); - auto *NewT = PointerType::get(BC->getType()->getPointerElementType(), - V->getType()->getPointerAddressSpace()); + auto *NewT = PointerType::getWithSamePointeeType( + cast<PointerType>(BC->getType()), + V->getType()->getPointerAddressSpace()); auto *NewI = new BitCastInst(V, NewT); IC.InsertNewInstWith(NewI, *BC); NewI->takeName(BC); @@ -345,8 +346,7 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) { #ifndef NDEBUG auto *PT = cast<PointerType>(I.getType()); auto *NT = cast<PointerType>(V->getType()); - assert(PT != NT && PT->getElementType() == NT->getElementType() && - "Invalid usage"); + assert(PT != NT && PT->hasSameElementTypeAs(NT) && "Invalid usage"); #endif WorkMap[&I] = V; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index aca7ec8d7325..1aa10b550fc4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -155,6 +155,9 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); @@ -348,13 +351,21 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { return CastInst::Create(Instruction::SExt, And, I.getType()); } - // (bool X) * Y --> X ? Y : 0 - // Y * (bool X) --> X ? Y : 0 + // (zext bool X) * Y --> X ? Y : 0 + // Y * (zext bool X) --> X ? Y : 0 if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0)); if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0)); + // (sext bool X) * C --> X ? 
-C : 0 + Constant *ImmC; + if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1) && + match(Op1, m_ImmConstant(ImmC))) { + Constant *NegC = ConstantExpr::getNeg(ImmC); + return SelectInst::Create(X, NegC, ConstantInt::getNullValue(I.getType())); + } + // (lshr X, 31) * Y --> (ashr X, 31) & Y // Y * (lshr X, 31) --> (ashr X, 31) & Y // TODO: We are not checking one-use because the elimination of the multiply @@ -442,6 +453,9 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; @@ -742,6 +756,9 @@ static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient, /// division instructions. /// Common integer divide transforms Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); bool IsSigned = I.getOpcode() == Instruction::SDiv; Type *Ty = I.getType(); @@ -1359,6 +1376,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + if (Instruction *R = foldFDivConstantDivisor(I)) return R; @@ -1460,6 +1480,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { /// remainder instructions. /// Common integer remainder transforms Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // The RHS is known non-zero. @@ -1638,5 +1661,8 @@ Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) { if (Instruction *X = foldVectorBinop(I)) return X; + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index a6d6b5199105..65e60498ff95 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -943,7 +943,7 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, } /// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single -/// call to cttz/ctlz with flag 'is_zero_undef' cleared. +/// call to cttz/ctlz with flag 'is_zero_poison' cleared. /// /// For example, we can fold the following code sequence: /// \code @@ -987,7 +987,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, // sizeof in bits of 'Count'. unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits(); if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) { - // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from + // Explicitly clear the 'is_zero_poison' flag. It's always valid to go from // true to false on this flag, so we can replace it for all users. II->setArgOperand(1, ConstantInt::getFalse(II->getContext())); return SelectArg; @@ -995,7 +995,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional // zext/trunc) have one use (ending at the select), the cttz/ctlz result will - // not be used if the input is zero. Relax to 'undef_on_zero' for that case. 
+ // not be used if the input is zero. Relax to 'zero is poison' for that case. if (II->hasOneUse() && SelectArg->hasOneUse() && !match(II->getArgOperand(1), m_One())) II->setArgOperand(1, ConstantInt::getTrue(II->getContext())); @@ -2325,8 +2325,9 @@ Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) { // The two operands of the add/sub must be nsw-truncatable to the NewTy. This // is usually achieved via a sext from a smaller type. - if (ComputeMinSignedBits(AddSub->getOperand(0), 0, AddSub) > NewBitWidth || - ComputeMinSignedBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth) + if (ComputeMaxSignificantBits(AddSub->getOperand(0), 0, AddSub) > + NewBitWidth || + ComputeMaxSignificantBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth) return nullptr; // Finally create and return the sat intrinsic, truncated to the new type diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 06421d553915..17f0c5c4cff0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -369,6 +369,9 @@ static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I, } Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); @@ -1032,12 +1035,13 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { NewLShr->setIsExact(I.isExact()); return NewLShr; } - // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C) - Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact()); - APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); - return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask)); - } - if (C1->ugt(ShAmtC)) { + if (Op0->hasOneUse()) { + // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C) + Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact()); + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); + return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask)); + } + } else if (C1->ugt(ShAmtC)) { unsigned ShlAmtC = C1->getZExtValue(); Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmtC - ShAmtC); if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) { @@ -1046,15 +1050,33 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { NewShl->setHasNoUnsignedWrap(true); return NewShl; } - // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C) - Value *NewShl = Builder.CreateShl(X, ShiftDiff); + if (Op0->hasOneUse()) { + // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C) + Value *NewShl = Builder.CreateShl(X, ShiftDiff); + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); + return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask)); + } + } else { + assert(*C1 == ShAmtC); + // (X << C) >>u C --> X & (-1 >>u C) APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); - return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask)); + return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); } - assert(*C1 == ShAmtC); - // (X << C) >>u C --> X & (-1 >>u C) - APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC)); - return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); + } + + // ((X << C) + Y) >>u C --> (X + (Y >>u C)) & (-1 >>u C) + // TODO: Consolidate with the more general transform that starts from shl + // (the shifts are in the opposite order). 
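// A minimal standalone check (not from this patch) of the identity used
// below: the low C bits of Y can never carry into the part of the sum
// that survives the unsigned shift, so
// ((X << C) + Y) >>u C == (X + (Y >>u C)) & (-1 >>u C).
#include <cassert>
#include <cstdint>
void checkLshrOfShlPlusOperand() {
  for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
    for (uint32_t Y : {0u, 7u, 0x80000001u, 0xFFFFFFFFu})
      for (unsigned C : {1u, 4u, 13u, 31u})
        assert((((X << C) + Y) >> C) == ((X + (Y >> C)) & (~0u >> C)));
}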
+ Value *Y; + if (match(Op0, + m_OneUse(m_c_Add(m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))), + m_Value(Y))))) { + Value *NewLshr = Builder.CreateLShr(Y, Op1); + Value *NewAdd = Builder.CreateAdd(NewLshr, X); + unsigned Op1Val = C->getLimitedValue(BitWidth); + APInt Bits = APInt::getLowBitsSet(BitWidth, BitWidth - Op1Val); + Constant *Mask = ConstantInt::get(Ty, Bits); + return BinaryOperator::CreateAnd(NewAdd, Mask); } if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && @@ -1094,7 +1116,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { } } - Value *Y; if (ShAmtC == BitWidth - 1) { // lshr i32 or(X,-X), 31 --> zext (X != 0) if (match(Op0, m_OneUse(m_c_Or(m_Neg(m_Value(X)), m_Deferred(X))))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 4dc712f32536..71a5ae24eead 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -800,22 +800,21 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Round NTZ down to the next byte. If we have 11 trailing zeros, then // we need all the bits down to bit 8. Likewise, round NLZ. If we // have 14 leading zeros, round to 8. - NLZ &= ~7; - NTZ &= ~7; + NLZ = alignDown(NLZ, 8); + NTZ = alignDown(NTZ, 8); // If we need exactly one byte, we can do this transformation. - if (BitWidth-NLZ-NTZ == 8) { - unsigned ResultBit = NTZ; - unsigned InputBit = BitWidth-NTZ-8; - + if (BitWidth - NLZ - NTZ == 8) { // Replace this with either a left or right shift to get the byte into // the right place. Instruction *NewVal; - if (InputBit > ResultBit) - NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0), - ConstantInt::get(I->getType(), InputBit-ResultBit)); + if (NLZ > NTZ) + NewVal = BinaryOperator::CreateLShr( + II->getArgOperand(0), + ConstantInt::get(I->getType(), NLZ - NTZ)); else - NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), - ConstantInt::get(I->getType(), ResultBit-InputBit)); + NewVal = BinaryOperator::CreateShl( + II->getArgOperand(0), + ConstantInt::get(I->getType(), NTZ - NLZ)); NewVal->takeName(I); return InsertNewInstWith(NewVal, *I); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index c6a4602e59e3..736cf9c825d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -495,8 +495,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { } GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, - NewOps); + GEP->getSourceElementType(), NewPtr, NewOps); NewGEP->setIsInBounds(GEP->isInBounds()); return NewGEP; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index eb5eadba194d..029be5257694 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1027,13 +1027,11 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, if (!ConstIsRHS) std::swap(Op0, Op1); - auto *BO = cast<BinaryOperator>(&I); - Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1, - SO->getName() + ".op"); - auto *FPInst = dyn_cast<Instruction>(RI); - if (FPInst && isa<FPMathOperator>(FPInst)) - 
FPInst->copyFastMathFlags(BO); - return RI; + Value *NewBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), Op0, + Op1, SO->getName() + ".op"); + if (auto *NewBOI = dyn_cast<Instruction>(NewBO)) + NewBOI->copyIRFlags(&I); + return NewBO; } Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, @@ -1289,6 +1287,70 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { return replaceInstUsesWith(I, NewPN); } +Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { + // TODO: This should be similar to the incoming values check in foldOpIntoPhi: + // we are guarding against replicating the binop in >1 predecessor. + // This could miss matching a phi with 2 constant incoming values. + auto *Phi0 = dyn_cast<PHINode>(BO.getOperand(0)); + auto *Phi1 = dyn_cast<PHINode>(BO.getOperand(1)); + if (!Phi0 || !Phi1 || !Phi0->hasOneUse() || !Phi1->hasOneUse() || + Phi0->getNumOperands() != 2 || Phi1->getNumOperands() != 2) + return nullptr; + + // TODO: Remove the restriction for binop being in the same block as the phis. + if (BO.getParent() != Phi0->getParent() || + BO.getParent() != Phi1->getParent()) + return nullptr; + + // Match a pair of incoming constants for one of the predecessor blocks. + BasicBlock *ConstBB, *OtherBB; + Constant *C0, *C1; + if (match(Phi0->getIncomingValue(0), m_ImmConstant(C0))) { + ConstBB = Phi0->getIncomingBlock(0); + OtherBB = Phi0->getIncomingBlock(1); + } else if (match(Phi0->getIncomingValue(1), m_ImmConstant(C0))) { + ConstBB = Phi0->getIncomingBlock(1); + OtherBB = Phi0->getIncomingBlock(0); + } else { + return nullptr; + } + if (!match(Phi1->getIncomingValueForBlock(ConstBB), m_ImmConstant(C1))) + return nullptr; + + // The block that we are hoisting to must reach here unconditionally. + // Otherwise, we could be speculatively executing an expensive or + // non-speculative op. + auto *PredBlockBranch = dyn_cast<BranchInst>(OtherBB->getTerminator()); + if (!PredBlockBranch || PredBlockBranch->isConditional() || + !DT.isReachableFromEntry(OtherBB)) + return nullptr; + + // TODO: This check could be tightened to only apply to binops (div/rem) that + // are not safe to speculatively execute. But that could allow hoisting + // potentially expensive instructions (fdiv for example). + for (auto BBIter = BO.getParent()->begin(); &*BBIter != &BO; ++BBIter) + if (!isGuaranteedToTransferExecutionToSuccessor(&*BBIter)) + return nullptr; + + // Make a new binop in the predecessor block with the non-constant incoming + // values. + Builder.SetInsertPoint(PredBlockBranch); + Value *NewBO = Builder.CreateBinOp(BO.getOpcode(), + Phi0->getIncomingValueForBlock(OtherBB), + Phi1->getIncomingValueForBlock(OtherBB)); + if (auto *NotFoldedNewBO = dyn_cast<BinaryOperator>(NewBO)) + NotFoldedNewBO->copyIRFlags(&BO); + + // Fold constants for the predecessor block with constant incoming values. + Constant *NewC = ConstantExpr::get(BO.getOpcode(), C0, C1); + + // Replace the binop with a phi of the new values. The old phis are dead. + PHINode *NewPhi = PHINode::Create(BO.getType(), 2); + NewPhi->addIncoming(NewBO, OtherBB); + NewPhi->addIncoming(NewC, ConstBB); + return NewPhi; +} + Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { if (!isa<Constant>(I.getOperand(1))) return nullptr; @@ -1307,10 +1369,11 @@ Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { /// is a sequence of GEP indices into the pointed type that will land us at the /// specified offset. 
If so, fill them into NewIndices and return the resultant /// element type, otherwise return null. -Type * -InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset, - SmallVectorImpl<Value *> &NewIndices) { - Type *Ty = PtrTy->getElementType(); +static Type *findElementAtOffset(PointerType *PtrTy, int64_t IntOffset, + SmallVectorImpl<Value *> &NewIndices, + const DataLayout &DL) { + // Only used by visitGEPOfBitcast(), which is skipped for opaque pointers. + Type *Ty = PtrTy->getNonOpaquePointerElementType(); if (!Ty->isSized()) return nullptr; @@ -1320,7 +1383,7 @@ InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset, return nullptr; for (const APInt &Index : Indices) - NewIndices.push_back(Builder.getInt(Index)); + NewIndices.push_back(ConstantInt::get(PtrTy->getContext(), Index)); return Ty; } @@ -1884,12 +1947,254 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP, return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel); } +Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, + GEPOperator *Src) { + // Combine Indices - If the source pointer to this getelementptr instruction + // is a getelementptr instruction with matching element type, combine the + // indices of the two getelementptr instructions into a single instruction. + if (Src->getResultElementType() != GEP.getSourceElementType()) + return nullptr; + + if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) + return nullptr; + + if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && + Src->hasOneUse()) { + Value *GO1 = GEP.getOperand(1); + Value *SO1 = Src->getOperand(1); + + if (LI) { + // Try to reassociate loop invariant GEP chains to enable LICM. + if (Loop *L = LI->getLoopFor(GEP.getParent())) { + // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is + // invariant: this breaks the dependence between GEPs and allows LICM + // to hoist the invariant part out of the loop. + if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { + // We have to be careful here. + // We have something like: + // %src = getelementptr <ty>, <ty>* %base, <ty> %idx + // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2 + // If we just swap idx & idx2 then we could inadvertently + // change %src from a vector to a scalar, or vice versa. + // Cases: + // 1) %base a scalar & idx a scalar & idx2 a vector + // => Swapping idx & idx2 turns %src into a vector type. + // 2) %base a scalar & idx a vector & idx2 a scalar + // => Swapping idx & idx2 turns %src into a scalar type + // 3) %base, %idx, and %idx2 are scalars + // => %src & %gep are scalars + // => swapping idx & idx2 is safe + // 4) %base a vector + // => %src is a vector + // => swapping idx & idx2 is safe. + auto *SO0 = Src->getOperand(0); + auto *SO0Ty = SO0->getType(); + if (!isa<VectorType>(GEP.getType()) || // case 3 + isa<VectorType>(SO0Ty)) { // case 4 + Src->setOperand(1, GO1); + GEP.setOperand(1, SO1); + return &GEP; + } else { + // Case 1 or 2 + // -- have to recreate %src & %gep + // put NewSrc at same location as %src + Builder.SetInsertPoint(cast<Instruction>(Src)); + Value *NewSrc = Builder.CreateGEP( + GEP.getSourceElementType(), SO0, GO1, Src->getName()); + // Propagate 'inbounds' if the new source was not constant-folded.
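// A minimal standalone sketch (not from this patch) of why the
// reassociation above is legal in the scalar cases: swapping the two GEP
// steps leaves the final address unchanged, which is what lets LICM
// hoist the loop-invariant step.
#include <cassert>
void swappedGEPStepsGiveSameAddress() {
  double Buf[64] = {};
  const int Invariant = 3;
  for (int Variant = 0; Variant < 8; ++Variant)
    // (Buf + Variant) + Invariant == (Buf + Invariant) + Variant
    assert(&Buf[Variant] + Invariant == &Buf[Invariant] + Variant);
}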
+ if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc)) + NewSrcGEPI->setIsInBounds(Src->isInBounds()); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + GEP.getSourceElementType(), NewSrc, {SO1}); + NewGEP->setIsInBounds(GEP.isInBounds()); + return NewGEP; + } + } + } + } + } + + // Note that if our source is a gep chain itself then we wait for that + // chain to be resolved before we perform this transformation. This + // avoids us creating a TON of code in some cases. + if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0))) + if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) + return nullptr; // Wait until our source is folded to completion. + + SmallVector<Value*, 8> Indices; + + // Find out whether the last index in the source GEP is a sequential idx. + bool EndsWithSequential = false; + for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); + I != E; ++I) + EndsWithSequential = I.isSequential(); + + // Can we combine the two pointer arithmetics offsets? + if (EndsWithSequential) { + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... + Value *SO1 = Src->getOperand(Src->getNumOperands()-1); + Value *GO1 = GEP.getOperand(1); + + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. + if (SO1->getType() != GO1->getType()) + return nullptr; + + Value *Sum = + SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + // Only do the combine when we are sure the cost after the + // merge is never more than that before the merge. + if (Sum == nullptr) + return nullptr; + + // Update the GEP in place if possible. + if (Src->getNumOperands() == 2) { + GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); + replaceOperand(GEP, 0, Src->getOperand(0)); + replaceOperand(GEP, 1, Sum); + return &GEP; + } + Indices.append(Src->op_begin()+1, Src->op_end()-1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin()+2, GEP.op_end()); + } else if (isa<Constant>(*GEP.idx_begin()) && + cast<Constant>(*GEP.idx_begin())->isNullValue() && + Src->getNumOperands() != 1) { + // Otherwise we can do the fold if the first index of the GEP is a zero + Indices.append(Src->op_begin()+1, Src->op_end()); + Indices.append(GEP.idx_begin()+1, GEP.idx_end()); + } + + if (!Indices.empty()) + return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) + ? GetElementPtrInst::CreateInBounds( + Src->getSourceElementType(), Src->getOperand(0), Indices, + GEP.getName()) + : GetElementPtrInst::Create(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()); + + return nullptr; +} + +// Note that we may have also stripped an address space cast in between. +Instruction *InstCombinerImpl::visitGEPOfBitcast(BitCastInst *BCI, + GetElementPtrInst &GEP) { + // With opaque pointers, there is no pointer element type we can use to + // adjust the GEP type. 
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy()); + if (SrcType->isOpaque()) + return nullptr; + + Type *GEPEltType = GEP.getSourceElementType(); + Type *SrcEltType = SrcType->getNonOpaquePointerElementType(); + Value *SrcOp = BCI->getOperand(0); + + // GEP directly using the source operand if this GEP is accessing an element + // of a bitcasted pointer to vector or array of the same dimensions: + // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z + // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z + auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, + const DataLayout &DL) { + auto *VecVTy = cast<FixedVectorType>(VecTy); + return ArrTy->getArrayElementType() == VecVTy->getElementType() && + ArrTy->getArrayNumElements() == VecVTy->getNumElements() && + DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); + }; + if (GEP.getNumOperands() == 3 && + ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) && + areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || + (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() && + areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { + + // Create a new GEP here, as using `setOperand()` followed by + // `setSourceElementType()` won't actually update the type of the + // existing GEP Value, causing issues if this Value is accessed when + // constructing an AddrSpaceCastInst. + SmallVector<Value *, 8> Indices(GEP.indices()); + Value *NGEP = GEP.isInBounds() + ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, Indices) + : Builder.CreateGEP(SrcEltType, SrcOp, Indices); + NGEP->takeName(&GEP); + + // Preserve GEP address space to satisfy users + if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(NGEP, GEP.getType()); + + return replaceInstUsesWith(GEP, NGEP); + } + + // See if we can simplify: + // X = bitcast A* to B* + // Y = gep X, <...constant indices...> + // into a gep of the original struct. This is important for SROA and alias + // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. + unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEP.getType()); + APInt Offset(OffsetBits, 0); + + // If the bitcast argument is an allocation, the bitcast is for conversion + // to the actual type of the allocation. Removing such bitcasts results in having + // GEPs with i8* base and pure byte offsets. That means GEP is not aware of + // struct or array hierarchy. + // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have + // a better chance to succeed. + if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) && + !isAllocationFn(SrcOp, &TLI)) { + // If this GEP instruction doesn't move the pointer, just replace the GEP + // with a bitcast of the real input to the dest type. + if (!Offset) { + // If the bitcast is of an allocation, and the allocation will be + // converted to match the type of the cast, don't touch this. + if (isa<AllocaInst>(SrcOp)) { + // See if the bitcast simplifies, if so, don't nuke this GEP yet.
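// A minimal standalone sketch (not from this patch) of the "field at
// Offset" question findElementAtOffset answers for the fold below: a
// constant byte offset from a struct pointer can often be re-expressed
// as a field access, i.e. as GEP indices.
#include <cassert>
#include <cstddef>
#include <cstdint>
struct Pair { int32_t A; int32_t B; };
void byteOffsetNamesAField(Pair &P) {
  // offsetof(Pair, B) bytes past &P is exactly &P.B, so a byte-offset
  // GEP at that distance can become gep Pair, 0, 1.
  assert(reinterpret_cast<char *>(&P) + offsetof(Pair, B) ==
         reinterpret_cast<char *>(&P.B));
}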
+ if (Instruction *I = visitBitCast(*BCI)) { + if (I != BCI) { + I->takeName(BCI); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); + replaceInstUsesWith(*BCI, I); + } + return &GEP; + } + } + + if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(SrcOp, GEP.getType()); + return new BitCastInst(SrcOp, GEP.getType()); + } + + // Otherwise, if the offset is non-zero, we need to find out if there is a + // field at Offset in 'A's type. If so, we can pull the cast through the + // GEP. + SmallVector<Value*, 8> NewIndices; + if (findElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices, DL)) { + Value *NGEP = + GEP.isInBounds() + ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices) + : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices); + + if (NGEP->getType() == GEP.getType()) + return replaceInstUsesWith(GEP, NGEP); + NGEP->takeName(&GEP); + + if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(NGEP, GEP.getType()); + return new BitCastInst(NGEP, GEP.getType()); + } + } + + return nullptr; +} + Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { - SmallVector<Value *, 8> Ops(GEP.operands()); + Value *PtrOp = GEP.getOperand(0); + SmallVector<Value *, 8> Indices(GEP.indices()); Type *GEPType = GEP.getType(); Type *GEPEltType = GEP.getSourceElementType(); bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType); - if (Value *V = SimplifyGEPInst(GEPEltType, Ops, GEP.isInBounds(), + if (Value *V = SimplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(), SQ.getWithInstruction(&GEP))) return replaceInstUsesWith(GEP, V); @@ -1912,8 +2217,6 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { // undef elements to decrease demanded bits } - Value *PtrOp = GEP.getOperand(0); - // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. bool MadeChange = false; @@ -2063,132 +2366,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { PtrOp = NewGEP; } - // Combine Indices - If the source pointer to this getelementptr instruction - // is a getelementptr instruction, combine the indices of the two - // getelementptr instructions into a single instruction. - if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) { - if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) - return nullptr; - - if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && - Src->hasOneUse()) { - Value *GO1 = GEP.getOperand(1); - Value *SO1 = Src->getOperand(1); - - if (LI) { - // Try to reassociate loop invariant GEP chains to enable LICM. - if (Loop *L = LI->getLoopFor(GEP.getParent())) { - // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is - // invariant: this breaks the dependence between GEPs and allows LICM - // to hoist the invariant part out of the loop. - if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { - // We have to be careful here. - // We have something like: - // %src = getelementptr <ty>, <ty>* %base, <ty> %idx - // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2 - // If we just swap idx & idx2 then we could inadvertantly - // change %src from a vector to a scalar, or vice versa. - // Cases: - // 1) %base a scalar & idx a scalar & idx2 a vector - // => Swapping idx & idx2 turns %src into a vector type. 
- // 2) %base a scalar & idx a vector & idx2 a scalar - // => Swapping idx & idx2 turns %src in a scalar type - // 3) %base, %idx, and %idx2 are scalars - // => %src & %gep are scalars - // => swapping idx & idx2 is safe - // 4) %base a vector - // => %src is a vector - // => swapping idx & idx2 is safe. - auto *SO0 = Src->getOperand(0); - auto *SO0Ty = SO0->getType(); - if (!isa<VectorType>(GEPType) || // case 3 - isa<VectorType>(SO0Ty)) { // case 4 - Src->setOperand(1, GO1); - GEP.setOperand(1, SO1); - return &GEP; - } else { - // Case 1 or 2 - // -- have to recreate %src & %gep - // put NewSrc at same location as %src - Builder.SetInsertPoint(cast<Instruction>(PtrOp)); - Value *NewSrc = - Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()); - // Propagate 'inbounds' if the new source was not constant-folded. - if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc)) - NewSrcGEPI->setIsInBounds(Src->isInBounds()); - GetElementPtrInst *NewGEP = - GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1}); - NewGEP->setIsInBounds(GEP.isInBounds()); - return NewGEP; - } - } - } - } - } - - // Note that if our source is a gep chain itself then we wait for that - // chain to be resolved before we perform this transformation. This - // avoids us creating a TON of code in some cases. - if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0))) - if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) - return nullptr; // Wait until our source is folded to completion. - - SmallVector<Value*, 8> Indices; - - // Find out whether the last index in the source GEP is a sequential idx. - bool EndsWithSequential = false; - for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); - I != E; ++I) - EndsWithSequential = I.isSequential(); - - // Can we combine the two pointer arithmetics offsets? - if (EndsWithSequential) { - // Replace: gep (gep %P, long B), long A, ... - // With: T = long A+B; gep %P, T, ... - Value *SO1 = Src->getOperand(Src->getNumOperands()-1); - Value *GO1 = GEP.getOperand(1); - - // If they aren't the same type, then the input hasn't been processed - // by the loop above yet (which canonicalizes sequential index types to - // intptr_t). Just avoid transforming this until the input has been - // normalized. - if (SO1->getType() != GO1->getType()) - return nullptr; - - Value *Sum = - SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); - // Only do the combine when we are sure the cost after the - // merge is never more than that before the merge. - if (Sum == nullptr) - return nullptr; - - // Update the GEP in place if possible. - if (Src->getNumOperands() == 2) { - GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); - replaceOperand(GEP, 0, Src->getOperand(0)); - replaceOperand(GEP, 1, Sum); - return &GEP; - } - Indices.append(Src->op_begin()+1, Src->op_end()-1); - Indices.push_back(Sum); - Indices.append(GEP.op_begin()+2, GEP.op_end()); - } else if (isa<Constant>(*GEP.idx_begin()) && - cast<Constant>(*GEP.idx_begin())->isNullValue() && - Src->getNumOperands() != 1) { - // Otherwise we can do the fold if the first index of the GEP is a zero - Indices.append(Src->op_begin()+1, Src->op_end()); - Indices.append(GEP.idx_begin()+1, GEP.idx_end()); - } - - if (!Indices.empty()) - return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) - ? 
GetElementPtrInst::CreateInBounds( - Src->getSourceElementType(), Src->getOperand(0), Indices, - GEP.getName()) - : GetElementPtrInst::Create(Src->getSourceElementType(), - Src->getOperand(0), Indices, - GEP.getName()); - } + if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) + if (Instruction *I = visitGEPOfGEP(GEP, Src)) + return I; // Skip if GEP source element type is scalable. The type alloc size is unknown // at compile-time. @@ -2234,9 +2414,13 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType()); - if (StrippedPtr != PtrOp) { + // TODO: The basic approach of these folds is not compatible with opaque + // pointers, because we can't use bitcasts as a hint for a desirable GEP + // type. Instead, we should perform canonicalization directly on the GEP + // type. For now, skip these. + if (StrippedPtr != PtrOp && !StrippedPtrTy->isOpaque()) { bool HasZeroPointerIndex = false; - Type *StrippedPtrEltTy = StrippedPtrTy->getElementType(); + Type *StrippedPtrEltTy = StrippedPtrTy->getNonOpaquePointerElementType(); if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1))) HasZeroPointerIndex = C->isZero(); @@ -2420,103 +2604,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { ASCStrippedPtrOp = BC; } - if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) { - Value *SrcOp = BCI->getOperand(0); - PointerType *SrcType = cast<PointerType>(BCI->getSrcTy()); - Type *SrcEltType = SrcType->getElementType(); - - // GEP directly using the source operand if this GEP is accessing an element - // of a bitcasted pointer to vector or array of the same dimensions: - // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z - // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z - auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, - const DataLayout &DL) { - auto *VecVTy = cast<FixedVectorType>(VecTy); - return ArrTy->getArrayElementType() == VecVTy->getElementType() && - ArrTy->getArrayNumElements() == VecVTy->getNumElements() && - DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); - }; - if (GEP.getNumOperands() == 3 && - ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) && - areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || - (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() && - areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { - - // Create a new GEP here, as using `setOperand()` followed by - // `setSourceElementType()` won't actually update the type of the - // existing GEP Value. Causing issues if this Value is accessed when - // constructing an AddrSpaceCastInst - Value *NGEP = - GEP.isInBounds() - ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]}) - : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]}); - NGEP->takeName(&GEP); - - // Preserve GEP address space to satisfy users - if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) - return new AddrSpaceCastInst(NGEP, GEPType); - - return replaceInstUsesWith(GEP, NGEP); - } - - // See if we can simplify: - // X = bitcast A* to B* - // Y = gep X, <...constant indices...> - // into a gep of the original struct. This is important for SROA and alias - // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. 
- unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType); - APInt Offset(OffsetBits, 0); - - // If the bitcast argument is an allocation, The bitcast is for convertion - // to actual type of allocation. Removing such bitcasts, results in having - // GEPs with i8* base and pure byte offsets. That means GEP is not aware of - // struct or array hierarchy. - // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have - // a better chance to succeed. - if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) && - !isAllocationFn(SrcOp, &TLI)) { - // If this GEP instruction doesn't move the pointer, just replace the GEP - // with a bitcast of the real input to the dest type. - if (!Offset) { - // If the bitcast is of an allocation, and the allocation will be - // converted to match the type of the cast, don't touch this. - if (isa<AllocaInst>(SrcOp)) { - // See if the bitcast simplifies, if so, don't nuke this GEP yet. - if (Instruction *I = visitBitCast(*BCI)) { - if (I != BCI) { - I->takeName(BCI); - BCI->getParent()->getInstList().insert(BCI->getIterator(), I); - replaceInstUsesWith(*BCI, I); - } - return &GEP; - } - } - - if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace()) - return new AddrSpaceCastInst(SrcOp, GEPType); - return new BitCastInst(SrcOp, GEPType); - } - - // Otherwise, if the offset is non-zero, we need to find out if there is a - // field at Offset in 'A's type. If so, we can pull the cast through the - // GEP. - SmallVector<Value*, 8> NewIndices; - if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) { - Value *NGEP = - GEP.isInBounds() - ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices) - : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices); - - if (NGEP->getType() == GEPType) - return replaceInstUsesWith(GEP, NGEP); - NGEP->takeName(&GEP); - - if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) - return new AddrSpaceCastInst(NGEP, GEPType); - return new BitCastInst(NGEP, GEPType); - } - } - } + if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) + if (Instruction *I = visitGEPOfBitcast(BCI, GEP)) + return I; if (!GEP.isInBounds()) { unsigned IdxWidth = @@ -2533,8 +2623,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize()); if (BasePtrOffset.ule(AllocSize)) { return GetElementPtrInst::CreateInBounds( - GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1), - GEP.getName()); + GEP.getSourceElementType(), PtrOp, Indices, GEP.getName()); } } } @@ -2553,10 +2642,6 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI, if (auto *LI = dyn_cast<LoadInst>(V)) return isa<GlobalVariable>(LI->getPointerOperand()); // Two distinct allocations will never be equal. - // We rely on LookThroughBitCast in isAllocLikeFn being false, since looking - // through bitcasts of V can cause - // the result statement below to be true, even when AI and V (ex: - // i8* ->i32* ->i8* of AI) are the same allocations. 
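// A minimal standalone sketch (not from this patch) of the guarantee the
// never-equal fold here relies on: two distinct live allocations have
// distinct addresses.
#include <cassert>
void distinctAllocationsNeverCompareEqual() {
  int *P = new int(0);
  int *Q = new int(0); // a different allocation than P
  assert(P != Q);      // analogous to the malloc()-vs-alloca IR fold
  delete P;
  delete Q;
}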
return isAllocLikeFn(V, &TLI) && V != AI; } @@ -2659,7 +2744,7 @@ static bool isAllocSiteRemovable(Instruction *AI, continue; } - if (isReallocLikeFn(I, &TLI, true)) { + if (isReallocLikeFn(I, &TLI)) { Users.emplace_back(I); Worklist.push_back(I); continue; @@ -2682,6 +2767,8 @@ static bool isAllocSiteRemovable(Instruction *AI, } Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { + assert(isa<AllocaInst>(MI) || isAllocRemovable(&cast<CallBase>(MI), &TLI)); + // If we have a malloc call which is only used in any amount of comparisons to // null and free calls, delete the calls and replace the comparisons with true // or false as appropriate. @@ -2900,7 +2987,7 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI) { // If we had free(realloc(...)) with no intervening uses, then eliminate the // realloc() entirely. if (CallInst *CI = dyn_cast<CallInst>(Op)) { - if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI, true)) { + if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI)) { return eraseInstFromFunction( *replaceInstUsesWith(*CI, CI->getOperand(0))); } @@ -3709,16 +3796,61 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { return nullptr; } +/// Check for case where the call writes to an otherwise dead alloca. This +/// shows up for unused out-params in idiomatic C/C++ code. Note that this +/// helper *only* analyzes the write; doesn't check any other legality aspect. +static bool SoleWriteToDeadLocal(Instruction *I, TargetLibraryInfo &TLI) { + auto *CB = dyn_cast<CallBase>(I); + if (!CB) + // TODO: handle e.g. store to alloca here - only worth doing if we extend + // to allow reload along used path as described below. Otherwise, this + // is simply a store to a dead allocation which will be removed. + return false; + Optional<MemoryLocation> Dest = MemoryLocation::getForDest(CB, TLI); + if (!Dest) + return false; + auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(Dest->Ptr)); + if (!AI) + // TODO: allow malloc? + return false; + // TODO: allow memory access dominated by move point? Note that since AI + // could have a reference to itself captured by the call, we would need to + // account for cycles in doing so. + SmallVector<const User *> AllocaUsers; + SmallPtrSet<const User *, 4> Visited; + auto pushUsers = [&](const Instruction &I) { + for (const User *U : I.users()) { + if (Visited.insert(U).second) + AllocaUsers.push_back(U); + } + }; + pushUsers(*AI); + while (!AllocaUsers.empty()) { + auto *UserI = cast<Instruction>(AllocaUsers.pop_back_val()); + if (isa<BitCastInst>(UserI) || isa<GetElementPtrInst>(UserI) || + isa<AddrSpaceCastInst>(UserI)) { + pushUsers(*UserI); + continue; + } + if (UserI == CB) + continue; + // TODO: support lifetime.start/end here + return false; + } + return true; +} + /// Try to move the specified instruction from its current block into the /// beginning of DestBlock, which can only happen if it's safe to move the /// instruction past all of the instructions between it and the end of its /// block. -static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { +static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock, + TargetLibraryInfo &TLI) { assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!"); BasicBlock *SrcBlock = I->getParent(); // Cannot move control-flow-involving, volatile loads, vaarg, etc. 
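// A minimal standalone sketch (not from this patch) of the C/C++ idiom
// SoleWriteToDeadLocal above targets; 'getDimensions' is a hypothetical
// out-param helper used only for illustration.
void getDimensions(int *W, int *H); // writes through both pointers
int widthOnly() {
  int W = 0, H = 0;      // H is an otherwise-dead local
  getDimensions(&W, &H); // the sole write to H; nothing reads it later
  return W;              // so the write to H is not observable
}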
- if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() || + if (isa<PHINode>(I) || I->isEHPad() || I->mayThrow() || !I->willReturn() || I->isTerminator()) return false; @@ -3738,6 +3870,14 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { if (CI->isConvergent()) return false; } + + // Unless we can prove that the memory write isn't visible except on the + // path we're sinking to, we must bail. + if (I->mayWriteToMemory()) { + if (!SoleWriteToDeadLocal(I, TLI)) + return false; + } + // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. if (I->mayReadFromMemory()) { @@ -3746,7 +3886,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { // successor block. if (DestBlock->getUniquePredecessor() != I->getParent()) return false; - for (BasicBlock::iterator Scan = I->getIterator(), + for (BasicBlock::iterator Scan = std::next(I->getIterator()), E = I->getParent()->end(); Scan != E; ++Scan) if (Scan->mayWriteToMemory()) @@ -3906,12 +4046,11 @@ bool InstCombinerImpl::run() { // predecessor, so that we don't have to split the critical edge. // Another option where we can sink is a block that ends with a // terminator that does not pass control to another block (such as - // return or unreachable). In this case: + // return or unreachable or resume). In this case: // - I dominates the User (by SSA form); // - the User will be executed at most once. // So sinking I down to User is always profitable or neutral. - if (UserParent->getUniquePredecessor() == BB || - (isa<ReturnInst>(Term) || isa<UnreachableInst>(Term))) { + if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) { assert(DT.dominates(BB, UserParent) && "Dominance relation broken?"); return UserParent; } @@ -3922,7 +4061,7 @@ bool InstCombinerImpl::run() { if (OptBB) { auto *UserParent = *OptBB; // Okay, the CFG is simple enough, try to sink this instruction. - if (TryToSinkInstruction(I, UserParent)) { + if (TryToSinkInstruction(I, UserParent, TLI)) { LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n'); MadeIRChange = true; // We'll add uses of the sunk instruction below, but since diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index bd2dc8d639fc..6e72255e51ae 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1547,10 +1547,9 @@ void AddressSanitizer::getInterestingMemoryOperands( Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true, XCHG->getCompareOperand()->getType(), None); } else if (auto CI = dyn_cast<CallInst>(I)) { - auto *F = CI->getCalledFunction(); - if (F && (F->getName().startswith("llvm.masked.load.") || - F->getName().startswith("llvm.masked.store."))) { - bool IsWrite = F->getName().startswith("llvm.masked.store."); + if (CI->getIntrinsicID() == Intrinsic::masked_load || + CI->getIntrinsicID() == Intrinsic::masked_store) { + bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_store; // Masked store has an initial operand for the value. unsigned OpOffset = IsWrite ? 1 : 0; if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads) @@ -1559,7 +1558,7 @@ void AddressSanitizer::getInterestingMemoryOperands( auto BasePtr = CI->getOperand(OpOffset); if (ignoreAccess(LI, BasePtr)) return; - auto Ty = cast<PointerType>(BasePtr->getType())->getElementType(); + Type *Ty = IsWrite ?
CI->getArgOperand(0)->getType() : CI->getType(); MaybeAlign Alignment = Align(1); // Otherwise no alignment guarantees. We probably got Undef. if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) @@ -1653,11 +1652,10 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, const DataLayout &DL, Type *IntptrTy, Value *Mask, Instruction *I, Value *Addr, MaybeAlign Alignment, - unsigned Granularity, uint32_t TypeSize, + unsigned Granularity, Type *OpType, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) { - auto *VTy = cast<FixedVectorType>( - cast<PointerType>(Addr->getType())->getElementType()); + auto *VTy = cast<FixedVectorType>(OpType); uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); unsigned Num = VTy->getNumElements(); auto Zero = ConstantInt::get(IntptrTy, 0); @@ -1735,7 +1733,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, unsigned Granularity = 1 << Mapping.Scale; if (O.MaybeMask) { instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(), - Addr, O.Alignment, Granularity, O.TypeSize, + Addr, O.Alignment, Granularity, O.OpType, O.IsWrite, nullptr, UseCalls, Exp); } else { doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment, diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 9f26b37bbc79..ff3aa14a2a83 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -208,6 +208,14 @@ static cl::opt<bool> ClEventCallbacks( cl::desc("Insert calls to __dfsan_*_callback functions on data events."), cl::Hidden, cl::init(false)); +// Experimental feature that inserts callbacks for conditionals, including: +// conditional branch, switch, select. +// This must be true for dfsan_set_conditional_callback() to have effect. +static cl::opt<bool> ClConditionalCallbacks( + "dfsan-conditional-callbacks", + cl::desc("Insert calls to callback functions on conditionals."), cl::Hidden, + cl::init(false)); + // Controls whether the pass tracks the control flow of select instructions. static cl::opt<bool> ClTrackSelectControlFlow( "dfsan-track-select-control-flow", @@ -428,6 +436,8 @@ class DataFlowSanitizer { FunctionType *DFSanSetLabelFnTy; FunctionType *DFSanNonzeroLabelFnTy; FunctionType *DFSanVarargWrapperFnTy; + FunctionType *DFSanConditionalCallbackFnTy; + FunctionType *DFSanConditionalCallbackOriginFnTy; FunctionType *DFSanCmpCallbackFnTy; FunctionType *DFSanLoadStoreCallbackFnTy; FunctionType *DFSanMemTransferCallbackFnTy; @@ -444,6 +454,8 @@ class DataFlowSanitizer { FunctionCallee DFSanLoadCallbackFn; FunctionCallee DFSanStoreCallbackFn; FunctionCallee DFSanMemTransferCallbackFn; + FunctionCallee DFSanConditionalCallbackFn; + FunctionCallee DFSanConditionalCallbackOriginFn; FunctionCallee DFSanCmpCallbackFn; FunctionCallee DFSanChainOriginFn; FunctionCallee DFSanChainOriginIfTaintedFn; @@ -454,7 +466,7 @@ class DataFlowSanitizer { MDNode *OriginStoreWeights; DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; - AttrBuilder ReadOnlyNoneAttrs; + AttributeMask ReadOnlyNoneAttrs; /// Memory map parameters used in calculation mapping application addresses /// to shadow addresses and origin addresses. 
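[Editor's note between hunks: the DataFlowSanitizer hunks above add the experimental -dfsan-conditional-callbacks mode, which instruments every conditional branch, switch, and select with a call to __dfsan_conditional_callback (or its _origin variant). A minimal client-side sketch of consuming that hook follows. It assumes the dfsan_set_conditional_callback() registration function named in the patch comment and the usual dfsan_label/dfsan_origin typedefs from <sanitizer/dfsan_interface.h>; the exact callback signature is an assumption, not taken from this diff.]

    #include <sanitizer/dfsan_interface.h>
    #include <cstdio>

    // Assumed shape: called with the shadow label of the condition (and its
    // origin when -dfsan-track-origins is enabled). A nonzero label means the
    // branch/switch/select condition depends on tainted data.
    static void OnConditional(dfsan_label Label, dfsan_origin Origin) {
      if (Label != 0)
        fprintf(stderr, "tainted conditional: label=%u origin=%u\n",
                (unsigned)Label, (unsigned)Origin);
    }

    int main(int argc, char **argv) {
      dfsan_set_conditional_callback(OnConditional);
      int Secret = argc;
      dfsan_set_label(1, &Secret, sizeof(Secret)); // taint Secret
      if (Secret > 1)            // instrumented: fires the callback, label 1
        puts("tainted branch taken");
      return 0;
    }

[Built, hypothetically, with: clang -fsanitize=dataflow -mllvm -dfsan-conditional-callbacks test.cpp — the cl::opt name comes from the hunk above.]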
@@ -642,6 +654,10 @@ struct DFSanFunction { Align getShadowAlign(Align InstAlignment); + // If ClConditionalCallbacks is enabled, insert a callback after a given + // branch instruction using the given conditional expression. + void addConditionalCallbacksIfEnabled(Instruction &I, Value *Condition); + private: /// Collapses the shadow with aggregate type into a single primitive shadow /// value. @@ -748,6 +764,8 @@ public: void visitSelectInst(SelectInst &I); void visitMemSetInst(MemSetInst &I); void visitMemTransferInst(MemTransferInst &I); + void visitBranchInst(BranchInst &BR); + void visitSwitchInst(SwitchInst &SW); private: void visitCASOrRMW(Align InstAlignment, Instruction &I); @@ -971,6 +989,22 @@ Value *DFSanFunction::collapseToPrimitiveShadow(Value *Shadow, return PrimitiveShadow; } +void DFSanFunction::addConditionalCallbacksIfEnabled(Instruction &I, + Value *Condition) { + if (!ClConditionalCallbacks) { + return; + } + IRBuilder<> IRB(&I); + Value *CondShadow = getShadow(Condition); + if (DFS.shouldTrackOrigins()) { + Value *CondOrigin = getOrigin(Condition); + IRB.CreateCall(DFS.DFSanConditionalCallbackOriginFn, + {CondShadow, CondOrigin}); + } else { + IRB.CreateCall(DFS.DFSanConditionalCallbackFn, {CondShadow}); + } +} + Type *DataFlowSanitizer::getShadowTy(Type *OrigTy) { if (!OrigTy->isSized()) return PrimitiveShadowTy; @@ -1032,6 +1066,13 @@ bool DataFlowSanitizer::initializeModule(Module &M) { FunctionType::get(Type::getVoidTy(*Ctx), None, /*isVarArg=*/false); DFSanVarargWrapperFnTy = FunctionType::get( Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + DFSanConditionalCallbackFnTy = + FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy, + /*isVarArg=*/false); + Type *DFSanConditionalCallbackOriginArgs[2] = {PrimitiveShadowTy, OriginTy}; + DFSanConditionalCallbackOriginFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), DFSanConditionalCallbackOriginArgs, + /*isVarArg=*/false); DFSanCmpCallbackFnTy = FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy, /*isVarArg=*/false); @@ -1160,7 +1201,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, // F is called by a wrapped custom function with primitive shadows. So // its arguments and return value need conversion. 
DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true, - /*ForceZeroLabels=*/false); + /*IsForceZeroLabels=*/false); Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) { @@ -1271,6 +1312,10 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanRuntimeFunctions.insert( DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( + DFSanConditionalCallbackFn.getCallee()->stripPointerCasts()); + DFSanRuntimeFunctions.insert( + DFSanConditionalCallbackOriginFn.getCallee()->stripPointerCasts()); + DFSanRuntimeFunctions.insert( DFSanCmpCallbackFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanChainOriginFn.getCallee()->stripPointerCasts()); @@ -1292,6 +1337,12 @@ void DataFlowSanitizer::initializeCallbackFunctions(Module &M) { "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy); DFSanCmpCallbackFn = Mod->getOrInsertFunction("__dfsan_cmp_callback", DFSanCmpCallbackFnTy); + + DFSanConditionalCallbackFn = Mod->getOrInsertFunction( + "__dfsan_conditional_callback", DFSanConditionalCallbackFnTy); + DFSanConditionalCallbackOriginFn = + Mod->getOrInsertFunction("__dfsan_conditional_callback_origin", + DFSanConditionalCallbackOriginFnTy); } void DataFlowSanitizer::injectMetadataGlobals(Module &M) { @@ -2593,6 +2644,8 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { Value *FalseOrigin = ShouldTrackOrigins ? DFSF.getOrigin(I.getFalseValue()) : nullptr; + DFSF.addConditionalCallbacksIfEnabled(I, I.getCondition()); + if (isa<VectorType>(I.getCondition()->getType())) { ShadowSel = DFSF.combineShadowsThenConvert(I.getType(), TrueShadow, FalseShadow, &I); @@ -2683,6 +2736,17 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { } } +void DFSanVisitor::visitBranchInst(BranchInst &BR) { + if (!BR.isConditional()) + return; + + DFSF.addConditionalCallbacksIfEnabled(BR, BR.getCondition()); +} + +void DFSanVisitor::visitSwitchInst(SwitchInst &SW) { + DFSF.addConditionalCallbacksIfEnabled(SW, SW.getCondition()); +} + static bool isAMustTailRetVal(Value *RetVal) { // Tail call may have a bitcast between return. 
if (auto *I = dyn_cast<BitCastInst>(RetVal)) { diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 8d3bc1383e96..fb10a99d1338 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1403,16 +1403,16 @@ bool HWAddressSanitizer::instrumentStack( size_t Size = getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + auto TagEnd = [&](Instruction *Node) { + IRB.SetInsertPoint(Node); + Value *UARTag = getUARTag(IRB, StackTag); + tagAlloca(IRB, AI, UARTag, AlignedSize); + }; bool StandardLifetime = UnrecognizedLifetimes.empty() && isStandardLifetime(Info, GetDT()); if (DetectUseAfterScope && StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; IRB.SetInsertPoint(Start->getNextNode()); - auto TagEnd = [&](Instruction *Node) { - IRB.SetInsertPoint(Node); - Value *UARTag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, UARTag, AlignedSize); - }; tagAlloca(IRB, AI, Tag, Size); if (!forAllReachableExits(GetDT(), GetPDT(), Start, Info.LifetimeEnd, RetVec, TagEnd)) { @@ -1421,11 +1421,8 @@ bool HWAddressSanitizer::instrumentStack( } } else { tagAlloca(IRB, AI, Tag, Size); - for (auto *RI : RetVec) { - IRB.SetInsertPoint(RI); - Value *UARTag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, UARTag, AlignedSize); - } + for (auto *RI : RetVec) + TagEnd(RI); if (!StandardLifetime) { for (auto &II : Info.LifetimeStart) II->eraseFromParent(); diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index de34348606ef..ab179b03dd29 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -248,8 +248,7 @@ public: PGOCounterPromoter( DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands, Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI) - : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop), - LI(LI), BFI(BFI) { + : LoopToCandidates(LoopToCands), L(CurLoop), LI(LI), BFI(BFI) { // Skip collection of ExitBlocks and InsertPts for loops that will not be // able to have counters promoted. 
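[Editor's note between hunks: the PGOCounterPromoter hunk just above drops the explicit ExitBlocks(), InsertPts() entries from the constructor's mem-initializer list. A small stand-alone illustration (hypothetical types, not from the diff) of why that change is purely cosmetic: members of class type are default-constructed whether or not they are spelled in the initializer list.]

    #include <vector>

    struct Promoter {
      std::vector<int> ExitBlocks; // default-constructed either way
      std::vector<int> InsertPts;  // default-constructed either way
      int Count;

      // Identical in effect to:
      //   Promoter(int N) : ExitBlocks(), InsertPts(), Count(N) {}
      explicit Promoter(int N) : Count(N) {}
    };

    int main() {
      Promoter P(3);
      return (int)(P.ExitBlocks.size() + P.InsertPts.size()); // 0
    }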
@@ -446,24 +445,19 @@ llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options, return new InstrProfilingLegacyPass(Options, IsCS); } -static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) { - InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr); - if (Inc) - return Inc; - return dyn_cast<InstrProfIncrementInst>(Instr); -} - bool InstrProfiling::lowerIntrinsics(Function *F) { bool MadeChange = false; PromotionCandidates.clear(); for (BasicBlock &BB : *F) { for (Instruction &Instr : llvm::make_early_inc_range(BB)) { - InstrProfIncrementInst *Inc = castToIncrementInst(&Instr); - if (Inc) { - lowerIncrement(Inc); + if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) { + lowerIncrement(IPIS); + MadeChange = true; + } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) { + lowerIncrement(IPI); MadeChange = true; - } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(&Instr)) { - lowerValueProfileInst(Ind); + } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) { + lowerValueProfileInst(IPVP); MadeChange = true; } } @@ -540,19 +534,14 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) { /// Check if the module contains uses of any profiling intrinsics. static bool containsProfilingIntrinsics(Module &M) { - if (auto *F = M.getFunction( - Intrinsic::getName(llvm::Intrinsic::instrprof_increment))) - if (!F->use_empty()) - return true; - if (auto *F = M.getFunction( - Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step))) - if (!F->use_empty()) - return true; - if (auto *F = M.getFunction( - Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile))) - if (!F->use_empty()) - return true; - return false; + auto containsIntrinsic = [&](int ID) { + if (auto *F = M.getFunction(Intrinsic::getName(ID))) + return !F->use_empty(); + return false; + }; + return containsIntrinsic(llvm::Intrinsic::instrprof_increment) || + containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) || + containsIntrinsic(llvm::Intrinsic::instrprof_value_profile); } bool InstrProfiling::run( @@ -771,7 +760,7 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { } /// Get the name of a profiling variable for a particular function. -static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix, +static std::string getVarName(InstrProfInstBase *Inc, StringRef Prefix, bool &Renamed) { StringRef NamePrefix = getInstrProfNameVarPrefix(); StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); @@ -860,7 +849,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { } GlobalVariable * -InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { +InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { GlobalVariable *NamePtr = Inc->getName(); auto &PD = ProfileDataMap[NamePtr]; if (PD.RegionCounters) @@ -997,8 +986,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } - if (DebugInfoCorrelate) + if (DebugInfoCorrelate) { + // Mark the counter variable as used so that it isn't optimized out. + CompilerUsedVars.push_back(PD.RegionCounters); return PD.RegionCounters; + } // Create data variable. 
auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext()); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 727672fa0605..8fedefccf0e1 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -156,6 +156,7 @@ struct InterestingMemoryAccess { Value *Addr = nullptr; bool IsWrite; unsigned Alignment; + Type *AccessTy; uint64_t TypeSize; Value *MaybeMask = nullptr; }; @@ -181,7 +182,7 @@ public: Value *Addr, uint32_t TypeSize, bool IsWrite); void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, Instruction *I, Value *Addr, - unsigned Alignment, uint32_t TypeSize, + unsigned Alignment, Type *AccessTy, bool IsWrite); void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); @@ -334,36 +335,32 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { InterestingMemoryAccess Access; - const DataLayout &DL = I->getModule()->getDataLayout(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (!ClInstrumentReads) return None; Access.IsWrite = false; - Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); + Access.AccessTy = LI->getType(); Access.Alignment = LI->getAlignment(); Access.Addr = LI->getPointerOperand(); } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { if (!ClInstrumentWrites) return None; Access.IsWrite = true; - Access.TypeSize = - DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); + Access.AccessTy = SI->getValueOperand()->getType(); Access.Alignment = SI->getAlignment(); Access.Addr = SI->getPointerOperand(); } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { if (!ClInstrumentAtomics) return None; Access.IsWrite = true; - Access.TypeSize = - DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); + Access.AccessTy = RMW->getValOperand()->getType(); Access.Alignment = 0; Access.Addr = RMW->getPointerOperand(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { if (!ClInstrumentAtomics) return None; Access.IsWrite = true; - Access.TypeSize = - DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); + Access.AccessTy = XCHG->getCompareOperand()->getType(); Access.Alignment = 0; Access.Addr = XCHG->getPointerOperand(); } else if (auto *CI = dyn_cast<CallInst>(I)) { @@ -376,16 +373,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { return None; // Masked store has an initial operand for the value. 
OpOffset = 1; + Access.AccessTy = CI->getArgOperand(0)->getType(); Access.IsWrite = true; } else { if (!ClInstrumentReads) return None; + Access.AccessTy = CI->getType(); Access.IsWrite = false; } auto *BasePtr = CI->getOperand(0 + OpOffset); - auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType(); - Access.TypeSize = DL.getTypeStoreSizeInBits(Ty); if (auto *AlignmentConstant = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) Access.Alignment = (unsigned)AlignmentConstant->getZExtValue(); @@ -412,15 +409,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const { if (Access.Addr->isSwiftError()) return None; + const DataLayout &DL = I->getModule()->getDataLayout(); + Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy); return Access; } void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask, Instruction *I, Value *Addr, unsigned Alignment, - uint32_t TypeSize, bool IsWrite) { - auto *VTy = cast<FixedVectorType>( - cast<PointerType>(Addr->getType())->getElementType()); + Type *AccessTy, bool IsWrite) { + auto *VTy = cast<FixedVectorType>(AccessTy); uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType()); unsigned Num = VTy->getNumElements(); auto *Zero = ConstantInt::get(IntptrTy, 0); @@ -469,7 +467,7 @@ void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL, if (Access.MaybeMask) { instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr, - Access.Alignment, Access.TypeSize, + Access.Alignment, Access.AccessTy, Access.IsWrite); } else { // Since the access counts will be accumulated across the entire allocation, diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 446e601cd4d7..cfe993dedbc2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -492,7 +492,7 @@ class MemorySanitizer { public: MemorySanitizer(Module &M, MemorySanitizerOptions Options) : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins), - Recover(Options.Recover) { + Recover(Options.Recover), EagerChecks(Options.EagerChecks) { initializeModule(M); } @@ -522,6 +522,7 @@ private: /// Track origins (allocation points) of uninitialized values. int TrackOrigins; bool Recover; + bool EagerChecks; LLVMContext *C; Type *IntptrTy; @@ -665,10 +666,12 @@ template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) { } // end anonymous namespace -MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K) +MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K, + bool EagerChecks) : Kernel(getOptOrDefault(ClEnableKmsan, K)), TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)), - Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {} + Recover(getOptOrDefault(ClKeepGoing, Kernel || R)), + EagerChecks(getOptOrDefault(ClEagerChecks, EagerChecks)) {} PreservedAnalyses MemorySanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { @@ -695,6 +698,8 @@ void MemorySanitizerPass::printPipeline( OS << "recover;"; if (Options.Kernel) OS << "kernel;"; + if (Options.EagerChecks) + OS << "eager-checks;"; OS << "track-origins=" << Options.TrackOrigins; OS << ">"; } @@ -1667,9 +1672,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// This function either returns the value set earlier with setShadow, /// or extracts if from ParamTLS (for function arguments). 
Value *getShadow(Value *V) { - if (!PropagateShadow) return getCleanShadow(V); if (Instruction *I = dyn_cast<Instruction>(V)) { - if (I->getMetadata("nosanitize")) + if (!PropagateShadow || I->getMetadata("nosanitize")) return getCleanShadow(V); // For instructions the shadow is already stored in the map. Value *Shadow = ShadowMap[V]; @@ -1681,7 +1685,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return Shadow; } if (UndefValue *U = dyn_cast<UndefValue>(V)) { - Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V); + Value *AllOnes = (PropagateShadow && PoisonUndef) ? getPoisonedShadow(V) + : getCleanShadow(V); LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n"); (void)U; return AllOnes; @@ -1701,22 +1706,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { continue; } - bool FArgByVal = FArg.hasByValAttr(); - bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef); - bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef; - unsigned Size = - FArg.hasByValAttr() - ? DL.getTypeAllocSize(FArg.getParamByValType()) - : DL.getTypeAllocSize(FArg.getType()); + unsigned Size = FArg.hasByValAttr() + ? DL.getTypeAllocSize(FArg.getParamByValType()) + : DL.getTypeAllocSize(FArg.getType()); if (A == &FArg) { bool Overflow = ArgOffset + Size > kParamTLSSize; - if (FArgEagerCheck) { - *ShadowPtr = getCleanShadow(V); - setOrigin(A, getCleanOrigin()); - break; - } else if (FArgByVal) { - Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); + if (FArg.hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. @@ -1727,40 +1723,38 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /*isStore*/ true) .first; // TODO(glider): need to copy origins. - if (Overflow) { + if (!PropagateShadow || Overflow) { // ParamTLS overflow. EntryIRB.CreateMemSet( CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign); } else { + Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base, CopyAlign, Size); LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); (void)Cpy; } + } + + if (!PropagateShadow || Overflow || FArg.hasByValAttr() || + (MS.EagerChecks && FArg.hasAttribute(Attribute::NoUndef))) { *ShadowPtr = getCleanShadow(V); + setOrigin(A, getCleanOrigin()); } else { // Shadow over TLS Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); - if (Overflow) { - // ParamTLS overflow. 
- *ShadowPtr = getCleanShadow(V); - } else { - *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, - kShadowTLSAlignment); + *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, + kShadowTLSAlignment); + if (MS.TrackOrigins) { + Value *OriginPtr = + getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); + setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr)); } } LLVM_DEBUG(dbgs() << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n"); - if (MS.TrackOrigins && !Overflow) { - Value *OriginPtr = - getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); - setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr)); - } else { - setOrigin(A, getCleanOrigin()); - } - break; } @@ -3664,7 +3658,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // will become a non-readonly function after it is instrumented by us. To // prevent this code from being optimized out, mark that function // non-readonly in advance. - AttrBuilder B; + AttributeMask B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::WriteOnly) @@ -3679,7 +3673,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI); } IRBuilder<> IRB(&CB); - bool MayCheckCall = ClEagerChecks; + bool MayCheckCall = MS.EagerChecks; if (Function *Func = CB.getCalledFunction()) { // __sanitizer_unaligned_{load,store} functions may be called by users // and always expects shadows in the TLS. So don't check them. @@ -3697,15 +3691,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { continue; } unsigned Size = 0; - Value *Store = nullptr; - // Compute the Shadow for arg even if it is ByVal, because - // in that case getShadow() will copy the actual arg shadow to - // __msan_param_tls. - Value *ArgShadow = getShadow(A); - Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); - LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A - << " Shadow: " << *ArgShadow << "\n"); - bool ArgIsInitialized = false; const DataLayout &DL = F.getParent()->getDataLayout(); bool ByVal = CB.paramHasAttr(i, Attribute::ByVal); @@ -3716,6 +3701,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertShadowCheck(A, &CB); Size = DL.getTypeAllocSize(A->getType()); } else { + bool ArgIsInitialized = false; + Value *Store = nullptr; + // Compute the Shadow for arg even if it is ByVal, because + // in that case getShadow() will copy the actual arg shadow to + // __msan_param_tls. + Value *ArgShadow = getShadow(A); + Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); + LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A + << " Shadow: " << *ArgShadow << "\n"); if (ByVal) { // ByVal requires some special handling as it's too big for a single // load @@ -3732,10 +3726,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ false) .first; - - Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr, - Alignment, Size); - // TODO(glider): need to copy origins. 
+ if (!PropagateShadow) { + Store = IRB.CreateMemSet(ArgShadowBase, + Constant::getNullValue(IRB.getInt8Ty()), + Size, Alignment); + } else { + Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr, + Alignment, Size); + } } else { // Any other parameters mean we need bit-grained tracking of uninit // data @@ -3832,10 +3830,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); bool HasNoUndef = F.hasRetAttribute(Attribute::NoUndef); - bool StoreShadow = !(ClEagerChecks && HasNoUndef); + bool StoreShadow = !(MS.EagerChecks && HasNoUndef); // FIXME: Consider using SpecialCaseList to specify a list of functions that // must always return fully initialized values. For now, we hardcode "main". - bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main"); + bool EagerCheck = (MS.EagerChecks && HasNoUndef) || (F.getName() == "main"); Value *Shadow = getShadow(RetVal); bool StoreOrigin = true; @@ -5359,7 +5357,7 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) { MemorySanitizerVisitor Visitor(F, *this, TLI); // Clear out readonly/readnone attributes. - AttrBuilder B; + AttributeMask B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::WriteOnly) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index b6ba1fc2132c..c46415e5b1f4 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -877,7 +877,10 @@ populateEHOperandBundle(VPCandidateInfo &Cand, DenseMap<BasicBlock *, ColorVector> &BlockColors, SmallVectorImpl<OperandBundleDef> &OpBundles) { auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst); - if (OrigCall && !isa<IntrinsicInst>(OrigCall)) { + if (!OrigCall) + return; + + if (!isa<IntrinsicInst>(OrigCall)) { // The instrumentation call should belong to the same funclet as a // non-intrinsic call, so just copy the operand bundle, if any exists. Optional<OperandBundleUse> ParentFunclet = diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index da8ee1f15bf8..d3b60c7add34 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -917,8 +917,7 @@ void ModuleSanitizerCoverage::InjectTraceForGep( void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores( Function &, ArrayRef<LoadInst *> Loads, ArrayRef<StoreInst *> Stores) { - auto CallbackIdx = [&](const Value *Ptr) -> int { - auto ElementTy = cast<PointerType>(Ptr->getType())->getElementType(); + auto CallbackIdx = [&](Type *ElementTy) -> int { uint64_t TypeSize = DL->getTypeStoreSizeInBits(ElementTy); return TypeSize == 8 ? 0 : TypeSize == 16 ? 
1 @@ -932,7 +931,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores( for (auto LI : Loads) { IRBuilder<> IRB(LI); auto Ptr = LI->getPointerOperand(); - int Idx = CallbackIdx(Ptr); + int Idx = CallbackIdx(LI->getType()); if (Idx < 0) continue; IRB.CreateCall(SanCovLoadFunction[Idx], @@ -941,7 +940,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores( for (auto SI : Stores) { IRBuilder<> IRB(SI); auto Ptr = SI->getPointerOperand(); - int Idx = CallbackIdx(Ptr); + int Idx = CallbackIdx(SI->getValueOperand()->getType()); if (Idx < 0) continue; IRB.CreateCall(SanCovStoreFunction[Idx], diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index 764dc5f92707..c11691c613ac 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -42,7 +42,7 @@ enum class ARCRuntimeEntryPointKind { Autorelease, StoreStrong, RetainRV, - ClaimRV, + UnsafeClaimRV, RetainAutorelease, RetainAutoreleaseRV, }; @@ -62,7 +62,7 @@ public: Autorelease = nullptr; StoreStrong = nullptr; RetainRV = nullptr; - ClaimRV = nullptr; + UnsafeClaimRV = nullptr; RetainAutorelease = nullptr; RetainAutoreleaseRV = nullptr; } @@ -87,9 +87,9 @@ public: case ARCRuntimeEntryPointKind::RetainRV: return getIntrinsicEntryPoint(RetainRV, Intrinsic::objc_retainAutoreleasedReturnValue); - case ARCRuntimeEntryPointKind::ClaimRV: + case ARCRuntimeEntryPointKind::UnsafeClaimRV: return getIntrinsicEntryPoint( - ClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue); + UnsafeClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue); case ARCRuntimeEntryPointKind::RetainAutorelease: return getIntrinsicEntryPoint(RetainAutorelease, Intrinsic::objc_retainAutorelease); @@ -127,7 +127,7 @@ private: Function *RetainRV = nullptr; /// Declaration for objc_unsafeClaimAutoreleasedReturnValue(). - Function *ClaimRV = nullptr; + Function *UnsafeClaimRV = nullptr; /// Declaration for objc_retainAutorelease(). Function *RetainAutorelease = nullptr; diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 4921209f041b..de0f5803b4c7 100644 --- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -194,9 +194,6 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, return CanInterruptRV(Class); } } - - case RetainRVDep: - return CanInterruptRV(GetBasicARCInstKind(Inst)); } llvm_unreachable("Invalid dependence flavor"); diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h index cf4c05ebe91c..dd6a1c3f9795 100644 --- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h +++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -46,8 +46,7 @@ enum DependenceKind { AutoreleasePoolBoundary, CanChangeRetainCount, RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. - RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. - RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. + RetainAutoreleaseRVDep ///< Blocks objc_retainAutoreleaseReturnValue. }; /// Find dependent instructions. 
If there is exactly one dependent instruction, diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index c2ed94e8e1f6..9e2832827686 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -433,7 +433,7 @@ bool ObjCARCContract::tryToPeepholeInstruction( // If we succeed in our optimization, fall through. LLVM_FALLTHROUGH; case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: { + case ARCInstKind::UnsafeClaimRV: { bool IsInstContainedInBundle = BundledInsts->contains(Inst); // Return now if the target doesn't need a special inline-asm marker. Return diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 0fa4904456cd..b6dc97f1e43f 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -515,7 +515,7 @@ class ObjCARCOpt { Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, Instruction *Inst, ARCInstKind Class, const Value *Arg); - /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the + /// Try to optimize an AutoreleaseRV with a RetainRV or UnsafeClaimRV. If the /// optimization occurs, returns true to indicate that the caller should /// assume the instructions are dead. bool OptimizeInlinedAutoreleaseRVCall( @@ -705,14 +705,14 @@ bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall( return true; } - // ClaimRV is a frontend peephole for RetainRV + Release. Since the - // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release. - assert(Class == ARCInstKind::ClaimRV); + // UnsafeClaimRV is a frontend peephole for RetainRV + Release. Since the + // AutoreleaseRV and RetainRV cancel out, replace UnsafeClaimRV with Release. + assert(Class == ARCInstKind::UnsafeClaimRV); Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0); CallInst *Release = CallInst::Create( EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst); - assert(IsAlwaysTail(ARCInstKind::ClaimRV) && - "Expected ClaimRV to be safe to tail call"); + assert(IsAlwaysTail(ARCInstKind::UnsafeClaimRV) && + "Expected UnsafeClaimRV to be safe to tail call"); Release->setTailCall(); Inst->replaceAllUsesWith(CallArg); EraseInstruction(Inst); @@ -810,7 +810,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { BlockColors = colorEHFunclets(F); // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired - // with RetainRV and ClaimRV. + // with RetainRV and UnsafeClaimRV. Instruction *DelayedAutoreleaseRV = nullptr; const Value *DelayedAutoreleaseRVArg = nullptr; auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) { @@ -837,7 +837,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { return false; // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and - // ClaimRV, it's probably safe to skip over even opaque function calls + // UnsafeClaimRV, it's probably safe to skip over even opaque function calls // here since OptimizeInlinedAutoreleaseRVCall will confirm that they // have the same RCIdentityRoot. However, what really matters is // skipping instructions or intrinsics that the inliner could leave behind; @@ -881,7 +881,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { setDelayedAutoreleaseRV(Inst); continue; case ARCInstKind::RetainRV: - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: if (DelayedAutoreleaseRV) { // We have a potential RV pair. Check if they cancel out. 
if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class, @@ -979,9 +979,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl( CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), CI); + new StoreInst(ConstantInt::getTrue(CI->getContext()), + UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI); Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( dbgs() << "A null pointer-to-weak-pointer is undefined behavior." @@ -999,9 +998,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl( if (IsNullOrUndef(CI->getArgOperand(0)) || IsNullOrUndef(CI->getArgOperand(1))) { Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), CI); + new StoreInst(ConstantInt::getTrue(CI->getContext()), + UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI); Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( @@ -1165,7 +1163,7 @@ void ObjCARCOpt::OptimizeIndividualCallImpl( DepInst = findSingleDependency(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, PA); break; - case ARCInstKind::ClaimRV: + case ARCInstKind::UnsafeClaimRV: case ARCInstKind::RetainRV: case ARCInstKind::AutoreleaseRV: // Don't move these; the RV optimization depends on the autoreleaseRV diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 6d0a67c91cfa..1624cf26094a 100644 --- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -32,7 +32,6 @@ namespace llvm { class AAResults; -class DataLayout; class PHINode; class SelectInst; class Value; diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index b693acceb3f6..1cda206a7e14 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -579,6 +579,7 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() { // Don't compute the post ordering unless we needed it. bool HavePostOrder = false; bool Changed = false; + SmallVector<DominatorTree::UpdateType, 10> DeletedEdges; for (auto *BB : BlocksWithDeadTerminators) { auto &Info = BlockInfo[BB]; @@ -617,7 +618,6 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() { makeUnconditional(BB, PreferredSucc->BB); // Inform the dominators about the deleted CFG edges. - SmallVector<DominatorTree::UpdateType, 4> DeletedEdges; for (auto *Succ : RemovedSuccessors) { // It might have happened that the same successor appeared multiple times // and the CFG edge wasn't really removed. 
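[Editor's note between hunks: the two ADCE hunks around this point hoist DeletedEdges out of the per-block loop so all CFG updates are applied through a single DomTreeUpdater rather than one per dead terminator. A hedged sketch of the batching pattern follows — removeDeadEdges is a hypothetical free function, but the DomTreeUpdater usage mirrors the diff.]

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/DomTreeUpdater.h"
    #include "llvm/Analysis/PostDominators.h"
    #include "llvm/IR/Dominators.h"

    using namespace llvm;

    // Collect every deleted (From, To) edge first, then flush once.
    // Batching avoids constructing an updater per edge and lets
    // applyUpdates() deduplicate repeats -- the patch comment notes the
    // same successor can appear multiple times without the edge really
    // being removed.
    static void removeDeadEdges(
        DominatorTree &DT, PostDominatorTree &PDT,
        ArrayRef<std::pair<BasicBlock *, BasicBlock *>> Edges) {
      SmallVector<DominatorTree::UpdateType, 10> Updates;
      for (const auto &E : Edges)
        Updates.push_back({DominatorTree::Delete, E.first, E.second});
      if (!Updates.empty())
        DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
            .applyUpdates(Updates);
    }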
@@ -629,13 +629,14 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() { } } - DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager) - .applyUpdates(DeletedEdges); - NumBranchesRemoved += 1; Changed = true; } + if (!DeletedEdges.empty()) + DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager) + .applyUpdates(DeletedEdges); + return Changed; } diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 37a7053d778e..25e8c3ef3b48 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -414,6 +414,14 @@ void ConstantHoistingPass::collectConstantCandidates( IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace()); APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true); auto *GEPO = cast<GEPOperator>(ConstExpr); + + // TODO: If we have a mix of inbounds and non-inbounds GEPs, then basing a + // non-inbounds GEP on an inbounds GEP is potentially incorrect. Restrict to + // inbounds GEP for now -- alternatively, we could drop inbounds from the + // constant expression, + if (!GEPO->isInBounds()) + return; + if (!GEPO->accumulateConstantOffset(*DL, Offset)) return; @@ -470,7 +478,7 @@ void ConstantHoistingPass::collectConstantCandidates( // Visit constant expressions that have constant integers. if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { // Handle constant gep expressions. - if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing()) + if (ConstHoistGEP && isa<GEPOperator>(ConstExpr)) collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr); // Only visit constant cast expressions. @@ -810,7 +818,7 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base, // Visit constant expression. if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { - if (ConstExpr->isGEPWithNoNotionalOverIndexing()) { + if (isa<GEPOperator>(ConstExpr)) { // Operand is a ConstantGEP, replace it. updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat); return; diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 7f2d5d7d9987..13963657d183 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -43,6 +43,51 @@ DEBUG_COUNTER(EliminatedCounter, "conds-eliminated", static int64_t MaxConstraintValue = std::numeric_limits<int64_t>::max(); +namespace { +struct ConstraintTy { + SmallVector<int64_t, 8> Coefficients; + + ConstraintTy(SmallVector<int64_t, 8> Coefficients) + : Coefficients(Coefficients) {} + + unsigned size() const { return Coefficients.size(); } +}; + +/// Struct to manage a list of constraints. +struct ConstraintListTy { + SmallVector<ConstraintTy, 4> Constraints; + + ConstraintListTy() {} + + ConstraintListTy(const SmallVector<ConstraintTy, 4> &Constraints) + : Constraints(Constraints) {} + + void mergeIn(const ConstraintListTy &Other) { + append_range(Constraints, Other.Constraints); + } + + unsigned size() const { return Constraints.size(); } + + unsigned empty() const { return Constraints.empty(); } + + /// Returns true if any constraint has a non-zero coefficient for any of the + /// newly added indices. Zero coefficients for new indices are removed. If it + /// returns true, no new variable need to be added to the system. 
+ bool needsNewIndices(const DenseMap<Value *, unsigned> &NewIndices) { + assert(size() == 1); + for (unsigned I = 0; I < NewIndices.size(); ++I) { + int64_t Last = get(0).Coefficients.pop_back_val(); + if (Last != 0) + return true; + } + return false; + } + + ConstraintTy &get(unsigned I) { return Constraints[I]; } +}; + +} // namespace + // Decomposes \p V into a vector of pairs of the form { c, X } where c * X. The // sum of the pairs equals \p V. The first pair is the constant-factor and X // must be nullptr. If the expression cannot be decomposed, returns an empty @@ -108,24 +153,15 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) { if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI)))) return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}}; if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1)))) - return {{0, nullptr}, {1, Op0}, {1, Op1}}; + return {{0, nullptr}, {1, Op0}, {-1, Op1}}; return {{0, nullptr}, {1, V}}; } -struct ConstraintTy { - SmallVector<int64_t, 8> Coefficients; - - ConstraintTy(SmallVector<int64_t, 8> Coefficients) - : Coefficients(Coefficients) {} - - unsigned size() const { return Coefficients.size(); } -}; - /// Turn a condition \p CmpI into a vector of constraints, using indices from \p /// Value2Index. Additional indices for newly discovered values are added to \p /// NewIndices. -static SmallVector<ConstraintTy, 4> +static ConstraintListTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, const DenseMap<Value *, unsigned> &Value2Index, DenseMap<Value *, unsigned> &NewIndices) { @@ -151,11 +187,15 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, Value2Index, NewIndices); if (Pred == CmpInst::ICMP_EQ) { + if (match(Op1, m_Zero())) + return getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, + NewIndices); + auto A = getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices); auto B = getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices); - append_range(A, B); + A.mergeIn(B); return A; } @@ -200,10 +240,10 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, R[GetOrAddIndex(KV.second)] -= KV.first; R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0); - return {R}; + return {{R}}; } -static SmallVector<ConstraintTy, 4> +static ConstraintListTy getConstraint(CmpInst *Cmp, const DenseMap<Value *, unsigned> &Value2Index, DenseMap<Value *, unsigned> &NewIndices) { return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0), @@ -397,21 +437,10 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { if (R.size() != 1) continue; - // Check if all coefficients of new indices are 0 after building the - // constraint. Skip if any of the new indices has a non-null - // coefficient. 
- bool HasNewIndex = false; - for (unsigned I = 0; I < NewIndices.size(); ++I) { - int64_t Last = R[0].Coefficients.pop_back_val(); - if (Last != 0) { - HasNewIndex = true; - break; - } - } - if (HasNewIndex || R[0].size() == 1) + if (R.needsNewIndices(NewIndices)) continue; - if (CS.isConditionImplied(R[0].Coefficients)) { + if (CS.isConditionImplied(R.get(0).Coefficients)) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; @@ -432,7 +461,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { Changed = true; } if (CS.isConditionImplied( - ConstraintSystem::negate(R[0].Coefficients))) { + ConstraintSystem::negate(R.get(0).Coefficients))) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; @@ -479,7 +508,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n"); bool Added = false; - for (auto &C : R) { + for (auto &C : R.Constraints) { auto Coeffs = C.Coefficients; LLVM_DEBUG({ dbgs() << " constraint: "; diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index eadbb4293539..ae636e7b61f7 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -699,17 +699,14 @@ bool isNoopIntrinsic(Instruction *I) { } // Check if we can ignore \p D for DSE. -bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller, - const TargetLibraryInfo &TLI) { +bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { Instruction *DI = D->getMemoryInst(); // Calls that only access inaccessible memory cannot read or write any memory // locations we consider for elimination. if (auto *CB = dyn_cast<CallBase>(DI)) - if (CB->onlyAccessesInaccessibleMemory()) { - if (isAllocLikeFn(DI, &TLI)) - return false; + if (CB->onlyAccessesInaccessibleMemory()) return true; - } + // We can eliminate stores to locations not visible to the caller across // throwing instructions. if (DI->mayThrow() && !DefVisibleToCaller) @@ -759,10 +756,8 @@ struct DSEState { SmallVector<MemoryDef *, 64> MemDefs; // Any that should be skipped as they are already deleted SmallPtrSet<MemoryAccess *, 4> SkipStores; - // Keep track of all of the objects that are invisible to the caller before - // the function returns. - // SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet; - DenseMap<const Value *, bool> InvisibleToCallerBeforeRet; + // Keep track whether a given object is captured before return or not. + DenseMap<const Value *, bool> CapturedBeforeReturn; // Keep track of all of the objects that are invisible to the caller after // the function returns. DenseMap<const Value *, bool> InvisibleToCallerAfterRet; @@ -805,12 +800,8 @@ struct DSEState { // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) - if (AI.hasPassPointeeByValueCopyAttr()) { - // For byval, the caller doesn't know the address of the allocation. - if (AI.hasByValAttr()) - InvisibleToCallerBeforeRet.insert({&AI, true}); + if (AI.hasPassPointeeByValueCopyAttr()) InvisibleToCallerAfterRet.insert({&AI, true}); - } // Collect whether there is any irreducible control flow in the function. 
ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI); @@ -835,6 +826,20 @@ struct DSEState { if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc)) return OW_Unknown; + const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts(); + const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts(); + const Value *DeadUndObj = getUnderlyingObject(DeadPtr); + const Value *KillingUndObj = getUnderlyingObject(KillingPtr); + + // Check whether the killing store overwrites the whole object, in which + // case the size/offset of the dead store does not matter. + if (DeadUndObj == KillingUndObj && KillingLoc.Size.isPrecise()) { + uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); + if (KillingUndObjSize != MemoryLocation::UnknownSize && + KillingUndObjSize == KillingLoc.Size.getValue()) + return OW_Complete; + } + // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll // get imprecise values here, though (except for unknown sizes). if (!KillingLoc.Size.isPrecise() || !DeadLoc.Size.isPrecise()) { @@ -875,14 +880,6 @@ struct DSEState { return OW_Complete; } - // Check to see if the killing store is to the entire object (either a - // global, an alloca, or a byval/inalloca argument). If so, then it clearly - // overwrites any other store to the same object. - const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts(); - const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts(); - const Value *DeadUndObj = getUnderlyingObject(DeadPtr); - const Value *KillingUndObj = getUnderlyingObject(KillingPtr); - // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (DeadUndObj != KillingUndObj) { @@ -896,12 +893,6 @@ struct DSEState { return OW_Unknown; } - // If the KillingI store is to a recognizable object, get its size. - uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); - if (KillingUndObjSize != MemoryLocation::UnknownSize) - if (KillingUndObjSize == KillingSize && KillingUndObjSize >= DeadSize) - return OW_Complete; - // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. @@ -957,31 +948,30 @@ struct DSEState { return true; auto I = InvisibleToCallerAfterRet.insert({V, false}); if (I.second) { - if (!isInvisibleToCallerBeforeRet(V)) { + if (!isInvisibleToCallerOnUnwind(V)) { I.first->second = false; - } else { - auto *Inst = dyn_cast<Instruction>(V); - if (Inst && isAllocLikeFn(Inst, &TLI)) - I.first->second = !PointerMayBeCaptured(V, true, false); + } else if (isNoAliasCall(V)) { + I.first->second = !PointerMayBeCaptured(V, true, false); } } return I.first->second; } - bool isInvisibleToCallerBeforeRet(const Value *V) { - if (isa<AllocaInst>(V)) + bool isInvisibleToCallerOnUnwind(const Value *V) { + bool RequiresNoCaptureBeforeUnwind; + if (!isNotVisibleOnUnwind(V, RequiresNoCaptureBeforeUnwind)) + return false; + if (!RequiresNoCaptureBeforeUnwind) return true; - auto I = InvisibleToCallerBeforeRet.insert({V, false}); - if (I.second) { - auto *Inst = dyn_cast<Instruction>(V); - if (Inst && isAllocLikeFn(Inst, &TLI)) - // NOTE: This could be made more precise by PointerMayBeCapturedBefore - // with the killing MemoryDef. But we refrain from doing so for now to - // limit compile-time and this does not cause any changes to the number - // of stores removed on a large test set in practice. 
- I.first->second = !PointerMayBeCaptured(V, false, true); - } - return I.first->second; + + auto I = CapturedBeforeReturn.insert({V, true}); + if (I.second) + // NOTE: This could be made more precise by PointerMayBeCapturedBefore + // with the killing MemoryDef. But we refrain from doing so for now to + // limit compile-time and this does not cause any changes to the number + // of stores removed on a large test set in practice. + I.first->second = PointerMayBeCaptured(V, false, true); + return !I.first->second; } Optional<MemoryLocation> getLocForWrite(Instruction *I) const { @@ -1269,8 +1259,7 @@ struct DSEState { MemoryDef *CurrentDef = cast<MemoryDef>(Current); Instruction *CurrentI = CurrentDef->getMemoryInst(); - if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj), - TLI)) { + if (canSkipDef(CurrentDef, !isInvisibleToCallerOnUnwind(KillingUndObj))) { CanOptimize = false; continue; } @@ -1442,7 +1431,7 @@ struct DSEState { continue; } - if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) { + if (UseInst->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) { LLVM_DEBUG(dbgs() << " ... found throwing instruction\n"); return None; } @@ -1623,7 +1612,7 @@ struct DSEState { // First see if we can ignore it by using the fact that KillingI is an // alloca/alloca like object that is not visible to the caller during // execution of the function. - if (KillingUndObj && isInvisibleToCallerBeforeRet(KillingUndObj)) + if (KillingUndObj && isInvisibleToCallerOnUnwind(KillingUndObj)) return false; if (KillingI->getParent() == DeadI->getParent()) @@ -1639,7 +1628,7 @@ struct DSEState { bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) { // If DeadI may throw it acts as a barrier, unless we are to an // alloca/alloca like object that does not escape. - if (DeadI->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) + if (DeadI->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) return true; // If DeadI is an atomic load/store stronger than monotonic, do not try to @@ -1696,6 +1685,84 @@ struct DSEState { return MadeChange; } + /// If we have a zero initializing memset following a call to malloc, + /// try folding it into a call to calloc. + bool tryFoldIntoCalloc(MemoryDef *Def, const Value *DefUO) { + Instruction *DefI = Def->getMemoryInst(); + MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI); + if (!MemSet) + // TODO: Could handle zero store to small allocation as well. + return false; + Constant *StoredConstant = dyn_cast<Constant>(MemSet->getValue()); + if (!StoredConstant || !StoredConstant->isNullValue()) + return false; + + if (!isRemovable(DefI)) + // The memset might be volatile.. 
+ return false; + + if (F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.getName() == "calloc") + return false; + auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUO)); + if (!Malloc) + return false; + auto *InnerCallee = Malloc->getCalledFunction(); + if (!InnerCallee) + return false; + LibFunc Func; + if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || + Func != LibFunc_malloc) + return false; + + auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) { + // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end + // of malloc block + auto *MallocBB = Malloc->getParent(), + *MemsetBB = Memset->getParent(); + if (MallocBB == MemsetBB) + return true; + auto *Ptr = Memset->getArgOperand(0); + auto *TI = MallocBB->getTerminator(); + ICmpInst::Predicate Pred; + BasicBlock *TrueBB, *FalseBB; + if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB, + FalseBB))) + return false; + if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB) + return false; + return true; + }; + + if (Malloc->getOperand(0) != MemSet->getLength()) + return false; + if (!shouldCreateCalloc(Malloc, MemSet) || + !DT.dominates(Malloc, MemSet) || + !memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) + return false; + IRBuilder<> IRB(Malloc); + const auto &DL = Malloc->getModule()->getDataLayout(); + auto *Calloc = + emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1), + Malloc->getArgOperand(0), IRB, TLI); + if (!Calloc) + return false; + MemorySSAUpdater Updater(&MSSA); + auto *LastDef = + cast<MemoryDef>(Updater.getMemorySSA()->getMemoryAccess(Malloc)); + auto *NewAccess = + Updater.createMemoryAccessAfter(cast<Instruction>(Calloc), LastDef, + LastDef); + auto *NewAccessMD = cast<MemoryDef>(NewAccess); + Updater.insertDef(NewAccessMD, /*RenameUses=*/true); + Updater.removeMemoryAccess(Malloc); + Malloc->replaceAllUsesWith(Calloc); + Malloc->eraseFromParent(); + return true; + } + /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { @@ -1713,81 +1780,15 @@ struct DSEState { if (!isRemovable(DefI)) return false; - if (StoredConstant && StoredConstant->isNullValue()) { - auto *DefUOInst = dyn_cast<Instruction>(DefUO); - if (DefUOInst) { - if (isCallocLikeFn(DefUOInst, &TLI)) { - auto *UnderlyingDef = - cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst)); - // If UnderlyingDef is the clobbering access of Def, no instructions - // between them can modify the memory location. 
- auto *ClobberDef = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def); - return UnderlyingDef == ClobberDef; - } - - if (MemSet) { - if (F.hasFnAttribute(Attribute::SanitizeMemory) || - F.hasFnAttribute(Attribute::SanitizeAddress) || - F.hasFnAttribute(Attribute::SanitizeHWAddress) || - F.getName() == "calloc") - return false; - auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUOInst)); - if (!Malloc) - return false; - auto *InnerCallee = Malloc->getCalledFunction(); - if (!InnerCallee) - return false; - LibFunc Func; - if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || - Func != LibFunc_malloc) - return false; - - auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) { - // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end - // of malloc block - auto *MallocBB = Malloc->getParent(), - *MemsetBB = Memset->getParent(); - if (MallocBB == MemsetBB) - return true; - auto *Ptr = Memset->getArgOperand(0); - auto *TI = MallocBB->getTerminator(); - ICmpInst::Predicate Pred; - BasicBlock *TrueBB, *FalseBB; - if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB, - FalseBB))) - return false; - if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB) - return false; - return true; - }; - - if (Malloc->getOperand(0) == MemSet->getLength()) { - if (shouldCreateCalloc(Malloc, MemSet) && - DT.dominates(Malloc, MemSet) && - memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) { - IRBuilder<> IRB(Malloc); - const auto &DL = Malloc->getModule()->getDataLayout(); - if (auto *Calloc = - emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1), - Malloc->getArgOperand(0), IRB, TLI)) { - MemorySSAUpdater Updater(&MSSA); - auto *LastDef = cast<MemoryDef>( - Updater.getMemorySSA()->getMemoryAccess(Malloc)); - auto *NewAccess = Updater.createMemoryAccessAfter( - cast<Instruction>(Calloc), LastDef, LastDef); - auto *NewAccessMD = cast<MemoryDef>(NewAccess); - Updater.insertDef(NewAccessMD, /*RenameUses=*/true); - Updater.removeMemoryAccess(Malloc); - Malloc->replaceAllUsesWith(Calloc); - Malloc->eraseFromParent(); - return true; - } - return false; - } - } - } - } + if (StoredConstant && isAllocationFn(DefUO, &TLI)) { + auto *CB = cast<CallBase>(DefUO); + auto *InitC = getInitialValueOfAllocation(CB, &TLI, + StoredConstant->getType()); + // If the clobbering access is LiveOnEntry, no instructions between them + // can modify the memory location. + if (InitC && InitC == StoredConstant) + return MSSA.isLiveOnEntryDef( + MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def)); } if (!Store) @@ -2074,6 +2075,15 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MadeChange = true; continue; } + + // Can we form a calloc from a memset/malloc pair? + if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) { + LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n" + << " DEAD: " << *KillingI << '\n'); + State.deleteDeadInstruction(KillingI); + MadeChange = true; + continue; + } } if (EnablePartialOverwriteTracking) diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index a24997dd3fd4..59b934c16c8a 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -827,10 +827,13 @@ private: const ParseMemoryInst &Later); Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const { + // TODO: We could insert relevant casts on type mismatch here. 
if (auto *LI = dyn_cast<LoadInst>(Inst)) - return LI; - if (auto *SI = dyn_cast<StoreInst>(Inst)) - return SI->getValueOperand(); + return LI->getType() == ExpectedType ? LI : nullptr; + else if (auto *SI = dyn_cast<StoreInst>(Inst)) { + Value *V = SI->getValueOperand(); + return V->getType() == ExpectedType ? V : nullptr; + } assert(isa<IntrinsicInst>(Inst) && "Instruction not supported"); auto *II = cast<IntrinsicInst>(Inst); if (isHandledNonTargetIntrinsic(II->getIntrinsicID())) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 00506fb86006..398c93e8758c 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1104,20 +1104,19 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, } assert(DepInfo.isDef() && "follows from above"); - // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || - isAlignedAllocLikeFn(DepInst, TLI) || - // Loading immediately after lifetime begin -> undef. - isLifetimeStart(DepInst)) { + // Loading the alloca -> undef. + // Loading immediately after lifetime begin -> undef. + if (isa<AllocaInst>(DepInst) || isLifetimeStart(DepInst)) { Res = AvailableValue::get(UndefValue::get(Load->getType())); return true; } - // Loading from calloc (which zero initializes memory) -> zero - if (isCallocLikeFn(DepInst, TLI)) { - Res = AvailableValue::get(Constant::getNullValue(Load->getType())); - return true; - } + if (isAllocationFn(DepInst, TLI)) + if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst), + TLI, Load->getType())) { + Res = AvailableValue::get(InitVal); + return true; + } if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of @@ -1769,7 +1768,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { // Insert a new store to null instruction before the load to indicate that // this code is not reachable. FIXME: We could insert unreachable // instruction directly because we can modify the CFG. - auto *NewS = new StoreInst(UndefValue::get(Int8Ty), + auto *NewS = new StoreInst(PoisonValue::get(Int8Ty), Constant::getNullValue(Int8Ty->getPointerTo()), IntrinsicI); if (MSSAU) { @@ -2991,12 +2990,12 @@ void GVNPass::addDeadBlock(BasicBlock *BB) { } } - // Now undef the incoming values from the dead predecessors. + // Now poison the incoming values from the dead predecessors. 
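Both the DSE storeIsNoop change and the GVN load-availability change above now go through getInitialValueOfAllocation, which reports a zero initial value for calloc-like allocations and undef for malloc-like ones. A rough C-level picture of the resulting folds (a hypothetical example, assuming nothing writes to the memory in between):

    int *p = (int *)calloc(4, sizeof(int));
    int x = p[0]; // GVN: load folds to 0, calloc zero-initializes
    p[1] = 0;     // DSE: no-op store, the location already holds 0 and the
                  // clobbering access is LiveOnEntry
    int *q = (int *)malloc(4 * sizeof(int));
    int y = q[0]; // GVN: load folds to undef, malloc contents indeterminate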
for (BasicBlock *P : predecessors(B)) { if (!DeadBlocks.count(P)) continue; for (PHINode &Phi : B->phis()) { - Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType())); + Phi.setIncomingValueForBlock(P, PoisonValue::get(Phi.getType())); if (MD) MD->invalidateCachedPointerInfo(&Phi); } diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 7001d330fce0..ceb03eb17f6d 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -138,8 +138,6 @@ AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), namespace { -struct RewritePhi; - class IndVarSimplify { LoopInfo *LI; ScalarEvolution *SE; @@ -982,6 +980,7 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, assert(isLoopCounter(IndVar, L, SE)); const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); const SCEV *IVInit = AR->getStart(); + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter // finds a valid pointer IV. Sign extend ExitCount in order to materialize a @@ -1004,13 +1003,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, assert(SE->isLoopInvariant(IVOffset, L) && "Computed iteration count is not loop invariant!"); - // We could handle pointer IVs other than i8*, but we need to compensate for - // gep index scaling. - assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), - cast<PointerType>(IndVar->getType()) - ->getElementType())->isOne() && - "unit stride pointer IV must be i8*"); - const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset); BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI); @@ -1026,7 +1018,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB, // IVInit integer and ExitCount pointer would only occur if a canonical IV // were generated on top of case #2, which is not expected. - assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); // For unit stride, IVCount = Start + ExitCount with 2's complement // overflow. diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 883d4afff3bd..8f5933b7bd71 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -250,12 +250,6 @@ public: char InferAddressSpaces::ID = 0; -namespace llvm { - -void initializeInferAddressSpacesPass(PassRegistry &); - -} // end namespace llvm - INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index fe9a7211967c..a3efad104ca6 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -728,8 +728,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Handle some boolean conditions. 
if (I->getType()->getPrimitiveSizeInBits() == 1) { using namespace PatternMatch; - - assert(Preference == WantInteger && "One-bit non-integer type?"); + if (Preference != WantInteger) + return false; // X | true -> true // X & false -> false Value *Op0, *Op1; @@ -789,8 +789,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Try to simplify some other binary operator values. } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - assert(Preference != WantBlockAddress - && "A binary operator creating a block address?"); + if (Preference != WantInteger) + return false; if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { PredValueInfoTy LHSVals; computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals, @@ -811,7 +811,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl( // Handle compare with phi operand, where the PHI is defined in this block. if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { - assert(Preference == WantInteger && "Compares only produce integers"); + if (Preference != WantInteger) + return false; Type *CmpType = Cmp->getType(); Value *CmpLHS = Cmp->getOperand(0); Value *CmpRHS = Cmp->getOperand(1); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index bc792ca3d8da..7fb1a25bdf13 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1355,7 +1355,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop, TargetTransformInfo::TCC_Free) return false; // For a GEP, we cannot simply use getUserCost because currently it - // optimistically assume that a GEP will fold into addressing mode + // optimistically assumes that a GEP will fold into addressing mode // regardless of its users. const BasicBlock *BB = GEP->getParent(); for (const User *U : GEP->users()) { @@ -1923,26 +1923,15 @@ bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, L->getHeader()->getTerminator(), DT); } -/// Return true iff we can prove that a caller of this function can not inspect -/// the contents of the provided object in a well defined program. -bool isKnownNonEscaping(Value *Object, const Loop *L, - const TargetLibraryInfo *TLI, DominatorTree *DT) { - if (isa<AllocaInst>(Object)) - // Since the alloca goes out of scope, we know the caller can't retain a - // reference to it and be well defined. Thus, we don't need to check for - // capture. - return true; +/// Return true if we can prove that a caller cannot inspect the object if an +/// unwind occurs inside the loop. +bool isNotVisibleOnUnwindInLoop(const Value *Object, const Loop *L, + DominatorTree *DT) { + bool RequiresNoCaptureBeforeUnwind; + if (!isNotVisibleOnUnwind(Object, RequiresNoCaptureBeforeUnwind)) + return false; - // For all other objects we need to know that the caller can't possibly - // have gotten a reference to the object. There are two components of - // that: - // 1) Object can't be escaped by this function. This is what - // PointerMayBeCaptured checks. - // 2) Object can't have been captured at definition site. For this, we - // need to know the return value is noalias. At the moment, we use a - // weaker condition and handle only AllocLikeFunctions (which are - // known to be noalias). 
TODO - return isAllocLikeFn(Object, TLI) && + return !RequiresNoCaptureBeforeUnwind || isNotCapturedBeforeOrInLoop(Object, L, DT); } @@ -2030,7 +2019,7 @@ bool llvm::promoteLoopAccessesToScalars( // this by proving that the caller can't have a reference to the object // after return and thus can't possibly load from the object. Value *Object = getUnderlyingObject(SomePtr); - if (!isKnownNonEscaping(Object, CurLoop, TLI, DT)) + if (!isNotVisibleOnUnwindInLoop(Object, CurLoop, DT)) return false; // Subtlety: Alloca's aren't visible to callers, but *are* potentially // visible to other threads if captured and used during their lifetimes. @@ -2163,7 +2152,7 @@ bool llvm::promoteLoopAccessesToScalars( else { Value *Object = getUnderlyingObject(SomePtr); SafeToInsertStore = - (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && + (isNoAliasCall(Object) || isa<AllocaInst>(Object)) && isNotCapturedBeforeOrInLoop(Object, CurLoop, DT); } } diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 5814e2f043d5..361d6c0d9381 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -407,25 +407,19 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (!L->getLoopLatch()) return LoopDeletionResult::Unmodified; - auto *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); - if (BTC->isZero()) { - // SCEV knows this backedge isn't taken! - breakLoopBackedge(L, DT, SE, LI, MSSA); - ++NumBackedgesBroken; - return LoopDeletionResult::Deleted; - } - - // If SCEV leaves open the possibility of a zero trip count, see if - // symbolically evaluating the first iteration lets us prove the backedge - // unreachable. - if (isa<SCEVCouldNotCompute>(BTC) || !SE.isKnownNonZero(BTC)) - if (canProveExitOnFirstIteration(L, DT, LI)) { - breakLoopBackedge(L, DT, SE, LI, MSSA); - ++NumBackedgesBroken; - return LoopDeletionResult::Deleted; + auto *BTCMax = SE.getConstantMaxBackedgeTakenCount(L); + if (!BTCMax->isZero()) { + auto *BTC = SE.getBackedgeTakenCount(L); + if (!BTC->isZero()) { + if (!isa<SCEVCouldNotCompute>(BTC) && SE.isKnownNonZero(BTC)) + return LoopDeletionResult::Unmodified; + if (!canProveExitOnFirstIteration(L, DT, LI)) + return LoopDeletionResult::Unmodified; } - - return LoopDeletionResult::Unmodified; + } + ++NumBackedgesBroken; + breakLoopBackedge(L, DT, SE, LI, MSSA); + return LoopDeletionResult::Deleted; } /// Remove a loop if it is dead. diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 965d1575518e..c46db4e63bfe 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -10,10 +10,13 @@ // // The intention is to optimise loop nests like this, which together access an // array linearly: +// // for (int i = 0; i < N; ++i) // for (int j = 0; j < M; ++j) // f(A[i*M+j]); +// // into one loop: +// // for (int i = 0; i < (N*M); ++i) // f(A[i]); // @@ -22,7 +25,27 @@ // expression like i*M+j. If they had any other uses, we would have to insert a // div/mod to reconstruct the original values, so this wouldn't be profitable. // -// We also need to prove that N*M will not overflow. +// We also need to prove that N*M will not overflow. The preferred solution is +// to widen the IV, which avoids overflow checks, so that is tried first. If +// the IV cannot be widened, then we try to determine that this new tripcount +// expression won't overflow. +// +// Q: Does LoopFlatten use SCEV? 
+// Short answer: Yes and no.
+//
+// Long answer:
+// For this transformation to be valid, we require all uses of the induction
+// variables to be linear expressions of the form i*M+j. The different Loop
+// APIs are used to get some loop components like the induction variable,
+// compare statement, etc. In addition, we do some pattern matching to find the
+// linear expressions and other loop components like the loop increment. The
+// latter are examples of expressions that do use the induction variable, but
+// are safe to ignore when we check all uses to be of the form i*M+j. We keep
+// track of all of this in the bookkeeping struct FlattenInfo.
+// We assume the loops to be canonical, i.e. starting at 0 and incrementing by
+// 1. This makes the RHS of the compare the loop tripcount (with the right
+// predicate). We then use SCEV to sanity check that this tripcount matches
+// the tripcount as computed by SCEV.
 //
 //===----------------------------------------------------------------------===//

@@ -31,6 +54,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -70,37 +94,54 @@ static cl::opt<bool>
                     "trip counts will never overflow"));

 static cl::opt<bool>
-    WidenIV("loop-flatten-widen-iv", cl::Hidden,
-            cl::init(true),
+    WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true),
             cl::desc("Widen the loop induction variables, if possible, so "
                      "overflow checks won't reject flattening"));

+// We require all uses of both induction variables to match this pattern:
+//
+//   (OuterPHI * InnerTripCount) + InnerPHI
+//
+// I.e., it needs to be a linear expression of the induction variables and the
+// inner loop trip count. We keep track of all the different expressions on
+// which checks will be performed in this bookkeeping struct.
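To make the requirement concrete: a use that consumes an induction variable outside such a linear expression forces the pass to bail out, because the value could only be recovered from the flattened counter with a div/mod. A minimal C sketch of a rejected nest (an illustrative example, not from the LLVM tests):

    void f(int *A, int N, int M) {
      for (int i = 0; i < N; ++i)
        for (int j = 0; j < M; ++j)
          A[i * M + j] = j; // the address matches (OuterPHI * InnerTripCount)
                            // + InnerPHI, but the stored value uses j on its
                            // own; recovering j from the flattened counter k
                            // would need k % M, so this nest is not flattened
    }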
+//
 struct FlattenInfo {
-  Loop *OuterLoop = nullptr;
+  Loop *OuterLoop = nullptr; // The loop pair to be flattened.
   Loop *InnerLoop = nullptr;
-  // These PHINodes correspond to loop induction variables, which are expected
-  // to start at zero and increment by one on each loop.
-  PHINode *InnerInductionPHI = nullptr;
-  PHINode *OuterInductionPHI = nullptr;
-  Value *InnerTripCount = nullptr;
-  Value *OuterTripCount = nullptr;
-  BinaryOperator *InnerIncrement = nullptr;
-  BinaryOperator *OuterIncrement = nullptr;
-  BranchInst *InnerBranch = nullptr;
-  BranchInst *OuterBranch = nullptr;
-  SmallPtrSet<Value *, 4> LinearIVUses;
+
+  PHINode *InnerInductionPHI = nullptr; // These PHINodes correspond to loop
+  PHINode *OuterInductionPHI = nullptr; // induction variables, which are
+                                        // expected to start at zero and
+                                        // increment by one on each loop.
+
+  Value *InnerTripCount = nullptr; // The product of these two tripcounts
+  Value *OuterTripCount = nullptr; // will be the new flattened loop
+                                   // tripcount. Also used to recognise a
+                                   // linear expression that will be replaced.
+
+  SmallPtrSet<Value *, 4> LinearIVUses; // Contains the linear expressions
+                                        // of the form i*M+j that will be
+                                        // replaced.
+
+  BinaryOperator *InnerIncrement = nullptr; // Uses of induction variables in
+  BinaryOperator *OuterIncrement = nullptr; // loop control statements that
+  BranchInst *InnerBranch = nullptr;        // are safe to ignore.
+
+  BranchInst *OuterBranch = nullptr; // The instruction that needs to be
+                                     // updated with the new tripcount.
+
+  SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;

-  // Whether this holds the flatten info before or after widening.
-  bool Widened = false;
+  bool Widened = false; // Whether this holds the flatten info before or after
+                        // widening.

-  // Holds the old/narrow induction phis, i.e. the Phis before IV widening has
-  // been applied. This bookkeeping is used so we can skip some checks on these
-  // phi nodes.
-  PHINode *NarrowInnerInductionPHI = nullptr;
-  PHINode *NarrowOuterInductionPHI = nullptr;
+  PHINode *NarrowInnerInductionPHI = nullptr; // Holds the old/narrow induction
+  PHINode *NarrowOuterInductionPHI = nullptr; // phis, i.e. the Phis before IV
+                                              // widening has been applied.
+                                              // Used to skip checks on phi
+                                              // nodes.

-  FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
+  FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){};

   bool isNarrowInductionPhi(PHINode *Phi) {
     // This can't be the narrow phi if we haven't widened the IV first.
     if (!Widened)
       return false;
     return NarrowInnerInductionPHI == Phi || NarrowOuterInductionPHI == Phi;
   }
+  bool isInnerLoopIncrement(User *U) {
+    return InnerIncrement == U;
+  }
+  bool isOuterLoopIncrement(User *U) {
+    return OuterIncrement == U;
+  }
+  bool isInnerLoopTest(User *U) {
+    return InnerBranch->getCondition() == U;
+  }
+
+  bool checkOuterInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+    for (User *U : OuterInductionPHI->users()) {
+      if (isOuterLoopIncrement(U))
+        continue;
+
+      auto IsValidOuterPHIUses = [&] (User *U) -> bool {
+        LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+        if (!ValidOuterPHIUses.count(U)) {
+          LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+          return false;
+        }
+        LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+        return true;
+      };
+
+      if (auto *V = dyn_cast<TruncInst>(U)) {
+        for (auto *K : V->users()) {
+          if (!IsValidOuterPHIUses(K))
+            return false;
+        }
+        continue;
+      }
+
+      if (!IsValidOuterPHIUses(U))
+        return false;
+    }
+    return true;
+  }
+
+  bool matchLinearIVUser(User *U, Value *InnerTripCount,
+                         SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+    LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
+    Value *MatchedMul = nullptr;
+    Value *MatchedItCount = nullptr;
+
+    bool IsAdd = match(U, m_c_Add(m_Specific(InnerInductionPHI),
+                                  m_Value(MatchedMul))) &&
+                 match(MatchedMul, m_c_Mul(m_Specific(OuterInductionPHI),
+                                           m_Value(MatchedItCount)));
+
+    // Matches the same pattern as above, except it also looks for truncs
+    // on the phi, which can be the result of widening the induction variables.
+    bool IsAddTrunc =
+        match(U, m_c_Add(m_Trunc(m_Specific(InnerInductionPHI)),
+                         m_Value(MatchedMul))) &&
+        match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(OuterInductionPHI)),
+                                  m_Value(MatchedItCount)));
+
+    if (!MatchedItCount)
+      return false;
+
+    // Look through extends if the IV has been widened.
+    if (Widened &&
+        (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
+      assert(MatchedItCount->getType() == InnerInductionPHI->getType() &&
+             "Unexpected type mismatch in types after widening");
+      MatchedItCount = isa<SExtInst>(MatchedItCount)
+                           ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
+                           : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
+    }
+
+    if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
+      LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+      ValidOuterPHIUses.insert(MatchedMul);
+      LinearIVUses.insert(U);
+      return true;
+    }
+
+    LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+    return false;
+  }
+
+  bool checkInnerInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+    Value *SExtInnerTripCount = InnerTripCount;
+    if (Widened &&
+        (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount)))
+      SExtInnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0);
+
+    for (User *U : InnerInductionPHI->users()) {
+      if (isInnerLoopIncrement(U))
+        continue;
+
+      // After widening the IVs, a trunc instruction might have been introduced,
+      // so look through truncs.
+      if (isa<TruncInst>(U)) {
+        if (!U->hasOneUse())
+          return false;
+        U = *U->user_begin();
+      }
+
+      // If the use is in the compare (which is also the condition of the inner
+      // branch) then the compare has been altered by another transformation,
+      // e.g. icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where
+      // tripcount is a constant. Ignore this use as the compare gets removed
+      // later anyway.
+      if (isInnerLoopTest(U))
+        continue;
+
+      if (!matchLinearIVUser(U, SExtInnerTripCount, ValidOuterPHIUses))
+        return false;
+    }
+    return true;
+  }
 };

 static bool
@@ -121,6 +274,77 @@ setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment,
   return true;
 }

+// Given the RHS of the loop latch compare instruction, verify with SCEV
+// that this is indeed the loop tripcount.
+// TODO: This used to be a straightforward check but has grown to be quite
+// complicated now. It is therefore worth revisiting what the additional
+// benefits are of this (compared to relying on canonical loops and pattern
+// matching).
+static bool verifyTripCount(Value *RHS, Loop *L,
+     SmallPtrSetImpl<Instruction *> &IterationInstructions,
+     PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment,
+     BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+    LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
+    return false;
+  }
+
+  // The Extend=false flag is used for getTripCountFromExitCount as we want
+  // to verify and match it with the pattern-matched tripcount. Please note
+  // that overflow checks are performed in checkOverflow, but we first try to
+  // avoid them by widening the IV.
+  const SCEV *SCEVTripCount =
+      SE->getTripCountFromExitCount(BackedgeTakenCount, /*Extend=*/false);
+
+  const SCEV *SCEVRHS = SE->getSCEV(RHS);
+  if (SCEVRHS == SCEVTripCount)
+    return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+  ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS);
+  if (ConstantRHS) {
+    const SCEV *BackedgeTCExt = nullptr;
+    if (IsWidened) {
+      const SCEV *SCEVTripCountExt;
+      // Find the extended backedge taken count and extended trip count using
+      // SCEV. One of these should now match the RHS of the compare.
+ BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType()); + SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false); + if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + } + // If the RHS of the compare is equal to the backedge taken count we need + // to add one to get the trip count. + if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) { + ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1); + Value *NewRHS = ConstantInt::get( + ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue()); + return setLoopComponents(NewRHS, TripCount, Increment, + IterationInstructions); + } + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); + } + // If the RHS isn't a constant then check that the reason it doesn't match + // the SCEV trip count is because the RHS is a ZExt or SExt instruction + // (and take the trip count to be the RHS). + if (!IsWidened) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + auto *TripCountInst = dyn_cast<Instruction>(RHS); + if (!TripCountInst) { + LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); + return false; + } + if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) || + SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) { + LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); + return false; + } + return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); +} + // Finds the induction variable, increment and trip count for a simple loop that // we can flatten. static bool findLoopComponents( @@ -197,63 +421,9 @@ static bool findLoopComponents( // another transformation has changed the compare (e.g. icmp ult %inc, // tripcount -> icmp ult %j, tripcount-1), or both. Value *RHS = Compare->getOperand(1); - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { - LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n"); - return false; - } - // The use of the Extend=false flag on getTripCountFromExitCount was added - // during a refactoring to preserve existing behavior. However, there's - // nothing obvious in the surrounding code when handles the overflow case. - // FIXME: audit code to establish whether there's a latent bug here. - const SCEV *SCEVTripCount = - SE->getTripCountFromExitCount(BackedgeTakenCount, false); - const SCEV *SCEVRHS = SE->getSCEV(RHS); - if (SCEVRHS == SCEVTripCount) - return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); - ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS); - if (ConstantRHS) { - const SCEV *BackedgeTCExt = nullptr; - if (IsWidened) { - const SCEV *SCEVTripCountExt; - // Find the extended backedge taken count and extended trip count using - // SCEV. One of these should now match the RHS of the compare. - BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType()); - SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false); - if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - } - // If the RHS of the compare is equal to the backedge taken count we need - // to add one to get the trip count. 
- if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) { - ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1); - Value *NewRHS = ConstantInt::get( - ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue()); - return setLoopComponents(NewRHS, TripCount, Increment, - IterationInstructions); - } - return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); - } - // If the RHS isn't a constant then check that the reason it doesn't match - // the SCEV trip count is because the RHS is a ZExt or SExt instruction - // (and take the trip count to be the RHS). - if (!IsWidened) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - auto *TripCountInst = dyn_cast<Instruction>(RHS); - if (!TripCountInst) { - LLVM_DEBUG(dbgs() << "Could not find valid trip count\n"); - return false; - } - if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) || - SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) { - LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n"); - return false; - } - return setLoopComponents(RHS, TripCount, Increment, IterationInstructions); + + return verifyTripCount(RHS, L, IterationInstructions, InductionPHI, TripCount, + Increment, BackBranch, SE, IsWidened); } static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) { @@ -399,108 +569,26 @@ checkOuterLoopInsts(FlattenInfo &FI, return true; } -static bool checkIVUsers(FlattenInfo &FI) { - // We require all uses of both induction variables to match this pattern: - // - // (OuterPHI * InnerTripCount) + InnerPHI - // - // Any uses of the induction variables not matching that pattern would - // require a div/mod to reconstruct in the flattened loop, so the - // transformation wouldn't be profitable. - - Value *InnerTripCount = FI.InnerTripCount; - if (FI.Widened && - (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount))) - InnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0); + +// We require all uses of both induction variables to match this pattern: +// +// (OuterPHI * InnerTripCount) + InnerPHI +// +// Any uses of the induction variables not matching that pattern would +// require a div/mod to reconstruct in the flattened loop, so the +// transformation wouldn't be profitable. +static bool checkIVUsers(FlattenInfo &FI) { // Check that all uses of the inner loop's induction variable match the // expected pattern, recording the uses of the outer IV. SmallPtrSet<Value *, 4> ValidOuterPHIUses; - for (User *U : FI.InnerInductionPHI->users()) { - if (U == FI.InnerIncrement) - continue; - - // After widening the IVs, a trunc instruction might have been introduced, - // so look through truncs. - if (isa<TruncInst>(U)) { - if (!U->hasOneUse()) - return false; - U = *U->user_begin(); - } - - // If the use is in the compare (which is also the condition of the inner - // branch) then the compare has been altered by another transformation e.g - // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is - // a constant. Ignore this use as the compare gets removed later anyway. 
- if (U == FI.InnerBranch->getCondition()) - continue; - - LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump()); - - Value *MatchedMul = nullptr; - Value *MatchedItCount = nullptr; - bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI), - m_Value(MatchedMul))) && - match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI), - m_Value(MatchedItCount))); - - // Matches the same pattern as above, except it also looks for truncs - // on the phi, which can be the result of widening the induction variables. - bool IsAddTrunc = - match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)), - m_Value(MatchedMul))) && - match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)), - m_Value(MatchedItCount))); - - if (!MatchedItCount) - return false; - // Look through extends if the IV has been widened. - if (FI.Widened && - (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) { - assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() && - "Unexpected type mismatch in types after widening"); - MatchedItCount = isa<SExtInst>(MatchedItCount) - ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0) - : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0); - } - - if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) { - LLVM_DEBUG(dbgs() << "Use is optimisable\n"); - ValidOuterPHIUses.insert(MatchedMul); - FI.LinearIVUses.insert(U); - } else { - LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); - return false; - } - } + if (!FI.checkInnerInductionPhiUsers(ValidOuterPHIUses)) + return false; // Check that there are no uses of the outer IV other than the ones found // as part of the pattern above. - for (User *U : FI.OuterInductionPHI->users()) { - if (U == FI.OuterIncrement) - continue; - - auto IsValidOuterPHIUses = [&] (User *U) -> bool { - LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump()); - if (!ValidOuterPHIUses.count(U)) { - LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Use is optimisable\n"); - return true; - }; - - if (auto *V = dyn_cast<TruncInst>(U)) { - for (auto *K : V->users()) { - if (!IsValidOuterPHIUses(K)) - return false; - } - continue; - } - - if (!IsValidOuterPHIUses(U)) - return false; - } + if (!FI.checkOuterInductionPhiUsers(ValidOuterPHIUses)) + return false; LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n"; dbgs() << "Found " << FI.LinearIVUses.size() @@ -535,7 +623,7 @@ static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT, for (Value *U : V->users()) { if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) { for (Value *GEPUser : U->users()) { - Instruction *GEPUserInst = dyn_cast<Instruction>(GEPUser); + auto *GEPUserInst = cast<Instruction>(GEPUser); if (!isa<LoadInst>(GEPUserInst) && !(isa<StoreInst>(GEPUserInst) && GEP == GEPUserInst->getOperand(1))) @@ -611,7 +699,8 @@ static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI, LPMUpdater *U) { + const TargetTransformInfo *TTI, LPMUpdater *U, + MemorySSAUpdater *MSSAU) { Function *F = FI.OuterLoop->getHeader()->getParent(); LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n"); { @@ -647,7 +736,11 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock(); 
InnerExitingBlock->getTerminator()->eraseFromParent(); BranchInst::Create(InnerExitBlock, InnerExitingBlock); + + // Update the DomTree and MemorySSA. DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader()); + if (MSSAU) + MSSAU->removeEdge(InnerExitingBlock, FI.InnerLoop->getHeader()); // Replace all uses of the polynomial calculated from the two induction // variables with the one new one. @@ -658,8 +751,8 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(), "flatten.trunciv"); - LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); - dbgs() << "with: "; OuterValue->dump()); + LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); dbgs() << "with: "; + OuterValue->dump()); V->replaceAllUsesWith(OuterValue); } @@ -698,7 +791,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // (OuterTripCount * InnerTripCount) as the new trip count is safe. if (InnerType != OuterType || InnerType->getScalarSizeInBits() >= MaxLegalSize || - MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) { + MaxLegalType->getScalarSizeInBits() < + InnerType->getScalarSizeInBits() * 2) { LLVM_DEBUG(dbgs() << "Can't widen the IV\n"); return false; } @@ -708,10 +802,10 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, unsigned ElimExt = 0; unsigned Widened = 0; - auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool { - PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, - ElimExt, Widened, true /* HasGuards */, - true /* UsePostIncrementRanges */); + auto CreateWideIV = [&](WideIVInfo WideIV, bool &Deleted) -> bool { + PHINode *WidePhi = + createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, ElimExt, Widened, + true /* HasGuards */, true /* UsePostIncrementRanges */); if (!WidePhi) return false; LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump()); @@ -721,14 +815,14 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, }; bool Deleted; - if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted)) + if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false}, Deleted)) return false; // Add the narrow phi to list, so that it will be adjusted later when the // the transformation is performed. if (!Deleted) FI.InnerPHIsToTransform.insert(FI.InnerInductionPHI); - if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted)) + if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false}, Deleted)) return false; assert(Widened && "Widened IV expected"); @@ -744,7 +838,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, - const TargetTransformInfo *TTI, LPMUpdater *U) { + const TargetTransformInfo *TTI, LPMUpdater *U, + MemorySSAUpdater *MSSAU) { LLVM_DEBUG( dbgs() << "Loop flattening running on outer loop " << FI.OuterLoop->getHeader()->getName() << " and inner loop " @@ -773,7 +868,7 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, // If we have widened and can perform the transformation, do that here. if (CanFlatten) - return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); // Otherwise, if we haven't widened the IV, check if the new iteration // variable might overflow. 
In this case, we need to version the loop, and @@ -791,18 +886,19 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, } LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n"); - return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); } bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, - AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U) { + AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U, + MemorySSAUpdater *MSSAU) { bool Changed = false; for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) continue; FlattenInfo FI(OuterLoop, InnerLoop); - Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U); + Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU); } return Changed; } @@ -813,16 +909,30 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, bool Changed = false; + Optional<MemorySSAUpdater> MSSAU; + if (AR.MSSA) { + MSSAU = MemorySSAUpdater(AR.MSSA); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); + } + // The loop flattening pass requires loops to be // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. - Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U); + Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U, + MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); if (!Changed) return PreservedAnalyses::all(); - return getLoopPassPreservedAnalyses(); + if (AR.MSSA && VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); + + auto PA = getLoopPassPreservedAnalyses(); + if (AR.MSSA) + PA.preserve<MemorySSAAnalysis>(); + return PA; } namespace { @@ -842,6 +952,7 @@ public: AU.addPreserved<TargetTransformInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<AssumptionCacheTracker>(); + AU.addPreserved<MemorySSAWrapperPass>(); } }; } // namespace @@ -854,7 +965,9 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops", false, false) -FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); } +FunctionPass *llvm::createLoopFlattenPass() { + return new LoopFlattenLegacyPass(); +} bool LoopFlattenLegacyPass::runOnFunction(Function &F) { ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); @@ -864,10 +977,17 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) { auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>(); auto *TTI = &TTIP.getTTI(F); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *MSSA = getAnalysisIfAvailable<MemorySSAWrapperPass>(); + + Optional<MemorySSAUpdater> MSSAU; + if (MSSA) + MSSAU = MemorySSAUpdater(&MSSA->getMSSA()); + bool Changed = false; for (Loop *L : *LI) { auto LN = LoopNest::getLoopNest(*L, *SE); - Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr); + Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr, + MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr); } return Changed; } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 5d00fa56e888..35ba4e2b4032 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1117,7 +1117,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, *DL, "loop-idiom"); - SCEVExpanderCleaner ExpCleaner(Expander, *DT); + SCEVExpanderCleaner ExpCleaner(Expander); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); @@ -1328,7 +1328,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, *DL, "loop-idiom"); - SCEVExpanderCleaner ExpCleaner(Expander, *DT); + SCEVExpanderCleaner ExpCleaner(Expander); bool Changed = false; const SCEV *StrStart = StoreEv->getStart(); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9f605b4ac4ad..c2b065c4eb31 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -292,33 +292,6 @@ static LoopVector populateWorklist(Loop &L) { return LoopList; } -static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { - PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); - if (InnerIndexVar) - return InnerIndexVar; - if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) - return nullptr; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - PHINode *PhiVar = cast<PHINode>(I); - Type *PhiTy = PhiVar->getType(); - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) - return nullptr; - const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); - if (!AddRec || !AddRec->isAffine()) - continue; - const SCEV *Step = AddRec->getStepRecurrence(*SE); - if (!isa<SCEVConstant>(Step)) - continue; - // Found the induction variable. - // FIXME: Handle loops with more than one induction variable. Note that, - // currently, legality makes sure we have only one induction variable. - return PhiVar; - } - return nullptr; -} - namespace { /// LoopInterchangeLegality checks if it is legal to interchange the loop. @@ -332,9 +305,13 @@ public: bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); + /// Discover induction PHIs in the header of \p L. Induction + /// PHIs are added to \p Inductions. + bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions); + /// Check if the loop structure is understood. We do not handle triangular /// loops for now. - bool isLoopStructureUnderstood(PHINode *InnerInductionVar); + bool isLoopStructureUnderstood(); bool currentLimitations(); @@ -342,6 +319,10 @@ public: return OuterInnerReductions; } + const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const { + return InnerLoopInductions; + } + private: bool tightlyNested(Loop *Outer, Loop *Inner); bool containsUnsafeInstructions(BasicBlock *BB); @@ -365,6 +346,9 @@ private: /// Set of reduction PHIs taking part of a reduction across the inner and /// outer loop. 
  SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+
+  /// Set of inner loop induction PHIs.
+  SmallVector<PHINode *, 8> InnerLoopInductions;
 };

 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -635,25 +619,26 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   return true;
 }

-bool LoopInterchangeLegality::isLoopStructureUnderstood(
-    PHINode *InnerInduction) {
-  unsigned Num = InnerInduction->getNumOperands();
+bool LoopInterchangeLegality::isLoopStructureUnderstood() {
   BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
-  for (unsigned i = 0; i < Num; ++i) {
-    Value *Val = InnerInduction->getOperand(i);
-    if (isa<Constant>(Val))
-      continue;
-    Instruction *I = dyn_cast<Instruction>(Val);
-    if (!I)
-      return false;
-    // TODO: Handle triangular loops.
-    // e.g. for(int i=0;i<N;i++)
-    //        for(int j=i;j<N;j++)
-    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
-    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
-            InnerLoopPreheader &&
-        !OuterLoop->isLoopInvariant(I)) {
-      return false;
+  for (PHINode *InnerInduction : InnerLoopInductions) {
+    unsigned Num = InnerInduction->getNumOperands();
+    for (unsigned i = 0; i < Num; ++i) {
+      Value *Val = InnerInduction->getOperand(i);
+      if (isa<Constant>(Val))
+        continue;
+      Instruction *I = dyn_cast<Instruction>(Val);
+      if (!I)
+        return false;
+      // TODO: Handle triangular loops.
+      // e.g. for(int i=0;i<N;i++)
+      //        for(int j=i;j<N;j++)
+      unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+      if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+              InnerLoopPreheader &&
+          !OuterLoop->isLoopInvariant(I)) {
+        return false;
+      }
     }
   }

@@ -682,27 +667,34 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
   // Return true if V is InnerInduction, or a cast from
   // InnerInduction, or a binary operator that involves
   // InnerInduction and a constant.
-  std::function<bool(Value *)> IsPathToIndVar;
-  IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool {
-    if (V == InnerInduction)
+  std::function<bool(Value *)> IsPathToInnerIndVar;
+  IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
+    if (llvm::is_contained(InnerLoopInductions, V))
       return true;
     if (isa<Constant>(V))
       return true;
-    Instruction *I = dyn_cast<Instruction>(V);
+    const Instruction *I = dyn_cast<Instruction>(V);
     if (!I)
       return false;
     if (isa<CastInst>(I))
-      return IsPathToIndVar(I->getOperand(0));
+      return IsPathToInnerIndVar(I->getOperand(0));
     if (isa<BinaryOperator>(I))
-      return IsPathToIndVar(I->getOperand(0)) &&
-             IsPathToIndVar(I->getOperand(1));
+      return IsPathToInnerIndVar(I->getOperand(0)) &&
+             IsPathToInnerIndVar(I->getOperand(1));
     return false;
   };

-  if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) {
+  // In case of multiple inner loop indvars, it is okay if LHS and RHS
+  // are both inner indvar related variables.
+  if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
+    return true;
+
+  // Otherwise we check if the cmp instruction compares an inner indvar
+  // related variable (Left) with an outer loop invariant (Right).
+  if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
     Left = Op0;
     Right = Op1;
-  } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) {
+  } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
     Left = Op1;
     Right = Op0;
   }
@@ -793,7 +785,6 @@ bool LoopInterchangeLegality::findInductionAndReductions(
 // This function indicates the current limitations in the transform as a result
 // of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() { - BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); // transform currently expects the loop latches to also be the exiting @@ -815,7 +806,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - PHINode *InnerInductionVar; SmallVector<PHINode *, 8> Inductions; if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { LLVM_DEBUG( @@ -831,20 +821,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: Currently we handle only loops with 1 induction variable. - if (Inductions.size() != 1) { - LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " - << "supported currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Only outer loops with 1 induction variable can be " - "interchanged currently."; - }); - return true; - } - Inductions.clear(); if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { LLVM_DEBUG( @@ -860,24 +836,8 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: Currently we handle only loops with 1 induction variable. - if (Inductions.size() != 1) { - LLVM_DEBUG( - dbgs() << "We currently only support loops with 1 induction variable." - << "Failed to interchange due to current limitation\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with 1 induction variable can be " - "interchanged currently."; - }); - return true; - } - InnerInductionVar = Inductions.pop_back_val(); - // TODO: Triangular loops are not handled for now. - if (!isLoopStructureUnderstood(InnerInductionVar)) { + if (!isLoopStructureUnderstood()) { LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", @@ -888,79 +848,17 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: Current limitation: Since we split the inner loop latch at the point - // were induction variable is incremented (induction.next); We cannot have - // more than 1 user of induction.next since it would result in broken code - // after split. - // e.g. - // for(i=0;i<N;i++) { - // for(j = 0;j<M;j++) { - // A[j+1][i+2] = A[j][i]+k; - // } - // } - Instruction *InnerIndexVarInc = nullptr; - if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) - InnerIndexVarInc = - dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); - else - InnerIndexVarInc = - dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); - - if (!InnerIndexVarInc) { - LLVM_DEBUG( - dbgs() << "Did not find an instruction to increment the induction " - << "variable.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "The inner loop does not increment the induction variable."; - }); - return true; - } - - // Since we split the inner loop latch on this induction variable. Make sure - // we do not have any instruction between the induction variable and branch - // instruction. 
-
-  bool FoundInduction = false;
-  for (const Instruction &I :
-       llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
-    if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
-        isa<ZExtInst>(I))
-      continue;
-
-    // We found an instruction. If this is not induction variable then it is not
-    // safe to split this loop latch.
-    if (!I.isIdenticalTo(InnerIndexVarInc)) {
-      LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
-                        << "variable increment and branch.\n");
-      ORE->emit([&]() {
-        return OptimizationRemarkMissed(
-                   DEBUG_TYPE, "UnsupportedInsBetweenInduction",
-                   InnerLoop->getStartLoc(), InnerLoop->getHeader())
-               << "Found unsupported instruction between induction variable "
-                  "increment and branch.";
-      });
-      return true;
-    }
+  return false;
+}

-    FoundInduction = true;
-    break;
-  }
-  // The loop latch ended and we didn't find the induction variable return as
-  // current limitation.
-  if (!FoundInduction) {
-    LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
-                                      InnerLoop->getStartLoc(),
-                                      InnerLoop->getHeader())
-             << "Did not find the induction variable.";
-    });
-    return true;
+bool LoopInterchangeLegality::findInductions(
+    Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
+  for (PHINode &PHI : L->getHeader()->phis()) {
+    InductionDescriptor ID;
+    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+      Inductions.push_back(&PHI);
   }
-  return false;
+  return !Inductions.empty();
 }
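With findInductions in place, the former single-induction-variable limitation is gone; a nest such as the following, whose inner loop maintains two induction PHIs, is now a legality candidate (an illustrative C sketch, assuming the interchange is otherwise legal and profitable):

    void g(int A[256][256], int B[256][256], int N) {
      for (int i = 0; i < N; ++i)
        for (int j = 0, k = 0; j < N; ++j, ++k) // two inner induction PHIs
          A[j][i] += B[k][i]; // previously rejected ("MultiInductionInner");
                              // findInductions now collects both PHIs
    }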
 // We currently only support LCSSA PHI nodes in the inner loop exit, if their
@@ -1076,7 +974,7 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
       for (Instruction &I : BB->instructionsWithoutDebug())
         if (CallInst *CI = dyn_cast<CallInst>(&I)) {
           // readnone functions do not prevent interchanging.
-          if (CI->doesNotReadMemory())
+          if (CI->onlyWritesMemory())
             continue;
           LLVM_DEBUG(
               dbgs() << "Loops with call instructions cannot be interchanged "
@@ -1091,6 +989,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
     return false;
   }

+  if (!findInductions(InnerLoop, InnerLoopInductions)) {
+    LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
+    return false;
+  }
+
   if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
     ORE->emit([&]() {
@@ -1347,25 +1250,25 @@ void LoopInterchangeTransform::restructureLoops(

 bool LoopInterchangeTransform::transform() {
   bool Transformed = false;
-  Instruction *InnerIndexVar;

   if (InnerLoop->getSubLoops().empty()) {
     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
     LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
-    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
-    if (!InductionPHI) {
+    auto &InductionPHIs = LIL.getInnerLoopInductions();
+    if (InductionPHIs.empty()) {
       LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
       return false;
     }

-    if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
-      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
-    else
-      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
-    // Ensure that InductionPHI is the first Phi node.
-    if (&InductionPHI->getParent()->front() != InductionPHI)
-      InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+    SmallVector<Instruction *, 8> InnerIndexVarList;
+    for (PHINode *CurInductionPHI : InductionPHIs) {
+      if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+        InnerIndexVarList.push_back(
+            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
+      else
+        InnerIndexVarList.push_back(
+            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
+    }

     // Create a new latch block for the inner loop. We split at the
     // current latch's terminator and then move the condition and all
@@ -1377,7 +1280,7 @@ bool LoopInterchangeTransform::transform() {
     SmallSetVector<Instruction *, 4> WorkList;
     unsigned i = 0;
-    auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+    auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
       for (; i < WorkList.size(); i++) {
         // Duplicate instruction and move it the new latch. Update uses that
         // have been moved.
         for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
           Instruction *UserI = cast<Instruction>(U.getUser());
           if (!InnerLoop->contains(UserI->getParent()) ||
-              UserI->getParent() == NewLatch || UserI == InductionPHI)
+              UserI->getParent() == NewLatch ||
+              llvm::is_contained(InductionPHIs, UserI))
             U.set(NewI);
         }
         // Add operands of moved instruction to the worklist, except if they are
           Instruction *OpI = dyn_cast<Instruction>(Op);
           if (!OpI ||
               this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
-              OpI == InductionPHI)
+              llvm::is_contained(InductionPHIs, OpI))
             continue;
           WorkList.insert(OpI);
         }
@@ -1412,7 +1316,8 @@ bool LoopInterchangeTransform::transform() {
     if (CondI)
       WorkList.insert(CondI);
     MoveInstructions();
-    WorkList.insert(cast<Instruction>(InnerIndexVar));
+    for (Instruction *InnerIndexVar : InnerIndexVarList)
+      WorkList.insert(cast<Instruction>(InnerIndexVar));
     MoveInstructions();

     // Splits the inner loops phi nodes out into a separate basic block.
@@ -1685,7 +1590,6 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
                   InnerLoopLatchSuccessor, DTUpdates);
-
   if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
   else
@@ -1712,19 +1616,22 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
   for (PHINode &PHI : InnerLoopHeader->phis())
     if (OuterInnerReductions.contains(&PHI))
-      InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+      InnerLoopPHIs.push_back(&PHI);
+
   for (PHINode &PHI : OuterLoopHeader->phis())
     if (OuterInnerReductions.contains(&PHI))
-      OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+      OuterLoopPHIs.push_back(&PHI);

   // Now move the remaining reduction PHIs from outer to inner loop header and
   // vice versa. The PHI nodes must be part of a reduction across the inner and
   // outer loop, and all that remains to do is update the incoming blocks.
for (PHINode *PHI : OuterLoopPHIs) { + LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump();); PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } for (PHINode *PHI : InnerLoopPHIs) { + LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump();); PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 798af48c2337..654f0d2a03a8 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3486,6 +3486,31 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { // Don't bother if the instruction is in a BB which ends in an EHPad. if (UseBB->getTerminator()->isEHPad()) continue; + + // Ignore cases in which the currently-examined value could come from + // a basic block terminated with an EHPad. This checks all incoming + // blocks of the phi node since it is possible that the same incoming + // value comes from multiple basic blocks, only some of which may end + // in an EHPad. If any of them do, a subsequent rewrite attempt by this + // pass would try to insert instructions into an EHPad, hitting an + // assertion. + if (isa<PHINode>(UserInst)) { + const auto *PhiNode = cast<PHINode>(UserInst); + bool HasIncompatibleEHPTerminatedBlock = false; + llvm::Value *ExpectedValue = U; + for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) { + if (PhiNode->getIncomingValue(I) == ExpectedValue) { + if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) { + HasIncompatibleEHPTerminatedBlock = true; + break; + } + } + } + if (HasIncompatibleEHPTerminatedBlock) { + continue; + } + } + // Don't bother rewriting PHIs in catchswitch blocks. if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator())) continue; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 893928fb0560..022d9c7abc8c 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1142,7 +1142,7 @@ static LoopUnrollResult tryToUnrollLoop( // automatic unrolling from interfering with the user requested // transformation. 
Loop *ParentL = L->getParentLoop(); - if (ParentL != NULL && + if (ParentL != nullptr && hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser && hasUnrollTransformation(L) != TM_ForcedByUser) { LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has" diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 1c186e9a0488..a7eb60b5e032 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -64,7 +64,7 @@ getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) { // __builtin_expect_with_probability assert(CI->getNumOperands() >= 3 && "expect with probability must have 3 arguments"); - ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2)); + auto *Confidence = cast<ConstantFP>(CI->getArgOperand(2)); double TrueProb = Confidence->getValueAPF().convertToDouble(); assert((TrueProb >= 0.0 && TrueProb <= 1.0) && "probability value must be in the range [0.0, 1.0]"); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 4e4097e13271..8f1d0181ee5b 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -220,9 +220,7 @@ class LowerMatrixIntrinsics { bool IsColumnMajor = true; public: - MatrixTy() - : Vectors(), - IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} + MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} MatrixTy(ArrayRef<Value *> Vectors) : Vectors(Vectors.begin(), Vectors.end()), IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} @@ -1393,7 +1391,8 @@ public: // reloads necessary. unsigned Op0Regs = (R + VF - 1) / VF * M; unsigned Op1Regs = (M + VF - 1) / VF * C; - return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true); + return Op0Regs + Op1Regs > + TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true)); } MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) { @@ -1832,7 +1831,7 @@ public: const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared, const SmallSetVector<Value *, 32> &ExprsInSubprogram, Value *Leaf) - : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared), + : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared), ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {} void indent(unsigned N) { @@ -1895,7 +1894,7 @@ public: write(Name); return; } - IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); + auto *II = cast<IntrinsicInst>(CI); write(Intrinsic::getBaseName(II->getIntrinsicID()) .drop_front(StringRef("llvm.matrix.").size())); write("."); diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 67335a45fb58..6698db26626b 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" @@ -171,7 +172,7 @@ public: bool empty() const { return Ranges.empty(); } void addInst(int64_t OffsetFromFirst, Instruction *Inst) { - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + if (auto *SI = dyn_cast<StoreInst>(Inst)) addStore(OffsetFromFirst, SI); else addMemSet(OffsetFromFirst, 
cast<MemSetInst>(Inst)); @@ -312,15 +313,21 @@ INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization", static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start, Instruction *End) { assert(Start->getParent() == End->getParent() && "Must be in same block"); - if (!Start->getFunction()->doesNotThrow() && - !isa<AllocaInst>(getUnderlyingObject(V))) { - for (const Instruction &I : - make_range(Start->getIterator(), End->getIterator())) { - if (I.mayThrow()) - return true; - } - } - return false; + // Function can't unwind, so it also can't be visible through unwinding. + if (Start->getFunction()->doesNotThrow()) + return false; + + // Object is not visible on unwind. + // TODO: Support RequiresNoCaptureBeforeUnwind case. + bool RequiresNoCaptureBeforeUnwind; + if (isNotVisibleOnUnwind(getUnderlyingObject(V), + RequiresNoCaptureBeforeUnwind) && + !RequiresNoCaptureBeforeUnwind) + return false; + + // Check whether there are any unwinding instructions in the range. + return any_of(make_range(Start->getIterator(), End->getIterator()), + [](const Instruction &I) { return I.mayThrow(); }); } void MemCpyOptPass::eraseInstruction(Instruction *I) { @@ -364,7 +371,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, const DataLayout &DL = StartInst->getModule()->getDataLayout(); // We can't track scalable types - if (StoreInst *SI = dyn_cast<StoreInst>(StartInst)) + if (auto *SI = dyn_cast<StoreInst>(StartInst)) if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable()) return nullptr; @@ -410,7 +417,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, continue; } - if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + if (auto *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. if (!NextStore->isSimple()) break; @@ -440,7 +447,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, Ranges.addStore(*Offset, NextStore); } else { - MemSetInst *MSI = cast<MemSetInst>(BI); + auto *MSI = cast<MemSetInst>(BI); if (MSI->isVolatile() || ByteVal != MSI->getValue() || !isa<ConstantInt>(MSI->getLength())) @@ -661,7 +668,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { return false; // Load to store forwarding can be interpreted as memcpy. - if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { + if (auto *LI = dyn_cast<LoadInst>(StoredVal)) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { @@ -871,7 +878,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, return false; // Require that src be an alloca. This simplifies the reasoning considerably. - AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc); + auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc); if (!srcAlloca) return false; @@ -890,8 +897,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // trap. Otherwise the transform is invalid since it might cause a trap // to occur earlier than it otherwise would. if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize), - DL, C, DT)) + DL, C, DT)) { + LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n"); return false; + } // Make sure that nothing can observe cpyDest being written early. There are // a number of cases to consider: @@ -907,8 +916,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, // guaranteed to be executed if C is. As it is a non-atomic access, it // renders accesses from other threads undefined. 
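// [A toy sketch for illustration; not part of this patch.] The rewritten
// mayBeVisibleThroughUnwinding above layers three cheap tests: a nounwind
// function cannot expose memory by unwinding; an object invisible on unwind
// (alloca, byval argument, or a noalias call pending the capture TODO) is
// safe regardless; only then is the instruction range scanned for potential
// throwers.
#include <algorithm>
#include <vector>

struct Inst { bool MayThrow; };

static bool mayBeVisibleThroughUnwindingModel(
    bool FnIsNoUnwind, bool ObjInvisibleOnUnwind,
    const std::vector<Inst> &Range) {
  if (FnIsNoUnwind)
    return false; // the function never unwinds at all
  if (ObjInvisibleOnUnwind)
    return false; // an unwinder could not reach the object anyway
  return std::any_of(Range.begin(), Range.end(),
                     [](const Inst &I) { return I.MayThrow; });
}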
// TODO: This is currently not checked. - if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) + if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) { + LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding"); return false; + } // Check that dest points to memory that is at least as aligned as src. Align srcAlign = srcAlloca->getAlign(); @@ -930,14 +941,14 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, append_range(srcUseList, U->users()); continue; } - if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) { + if (const auto *G = dyn_cast<GetElementPtrInst>(U)) { if (!G->hasAllZeroIndices()) return false; append_range(srcUseList, U->users()); continue; } - if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U)) + if (const auto *IT = dyn_cast<IntrinsicInst>(U)) if (IT->isLifetimeStartOrEnd()) continue; @@ -945,12 +956,57 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, return false; } - // Check that src isn't captured by the called function since the - // transformation can cause aliasing issues in that case. - for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI) - if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI)) + // Check whether src is captured by the called function, in which case there + // may be further indirect uses of src. + bool SrcIsCaptured = any_of(C->args(), [&](Use &U) { + return U->stripPointerCasts() == cpySrc && + !C->doesNotCapture(C->getArgOperandNo(&U)); + }); + + // If src is captured, then check whether there are any potential uses of + // src through the captured pointer before the lifetime of src ends, either + // due to a lifetime.end or a return from the function. + if (SrcIsCaptured) { + // Check that dest is not captured before/at the call. We have already + // checked that src is not captured before it. If either had been captured, + // then the call might be comparing the argument against the captured dest + // or src pointer. + Value *DestObj = getUnderlyingObject(cpyDest); + if (!isIdentifiedFunctionLocal(DestObj) || + PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true, + /* StoreCaptures */ true, C, DT, + /* IncludeI */ true)) return false; + MemoryLocation SrcLoc = + MemoryLocation(srcAlloca, LocationSize::precise(srcSize)); + for (Instruction &I : + make_range(++C->getIterator(), C->getParent()->end())) { + // Lifetime of srcAlloca ends at lifetime.end. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_end && + II->getArgOperand(1)->stripPointerCasts() == srcAlloca && + cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize)) + break; + } + + // Lifetime of srcAlloca ends at return. + if (isa<ReturnInst>(&I)) + break; + + // Ignore the direct read of src in the load. + if (&I == cpyLoad) + continue; + + // Check whether this instruction may mod/ref src through the captured + // pointer (we have already any direct mod/refs in the loop above). + // Also bail if we hit a terminator, as we don't want to scan into other + // blocks. + if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator()) + return false; + } + } + // Since we're changing the parameter to the callsite, we need to make sure // that what would be the new parameter dominates the callsite. 
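// [A toy sketch for illustration; not part of this patch.] The new
// captured-source handling above scans forward from the call to the end of
// its block: a covering lifetime.end or a return ends the source's lifetime
// safely, the direct load being eliminated is skipped, and any instruction
// that may mod/ref the source (or a terminator leading into other blocks)
// defeats the call-slot transform.
#include <vector>

enum class Kind { LifetimeEnd, Return, TheLoad, Other, Terminator };
struct MemInst {
  Kind K;
  bool ModRefsSrc = false;
};

static bool srcProvablyUnusedAfterCall(const std::vector<MemInst> &AfterCall) {
  for (const MemInst &I : AfterCall) {
    if (I.K == Kind::LifetimeEnd || I.K == Kind::Return)
      return true; // lifetime of the source is provably over
    if (I.K == Kind::TheLoad)
      continue; // the direct read we are replacing
    if (I.ModRefsSrc || I.K == Kind::Terminator)
      return false; // possible indirect use through the capture
  }
  return true; // model only: a real block always ends in a terminator
}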
if (!DT->dominates(cpyDest, C)) { @@ -1018,6 +1074,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, LLVMContext::MD_invariant_group, LLVMContext::MD_access_group}; combineMetadata(C, cpyLoad, KnownIDs, true); + if (cpyLoad != cpyStore) + combineMetadata(C, cpyStore, KnownIDs, true); ++NumCallSlot; return true; @@ -1043,8 +1101,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. if (MDep->getLength() != M->getLength()) { - ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); - ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); + auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); + auto *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; } @@ -1163,7 +1221,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, const unsigned DestAlign = std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment()); if (DestAlign > 1) - if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) + if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); IRBuilder<> Builder(MemCpy); @@ -1211,12 +1269,11 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V, if (MSSA->isLiveOnEntryDef(Def)) return isa<AllocaInst>(getUnderlyingObject(V)); - if (IntrinsicInst *II = - dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) { + if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) { - ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0)); + auto *LTSize = cast<ConstantInt>(II->getArgOperand(0)); - if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) { + if (auto *CSize = dyn_cast<ConstantInt>(Size)) { if (AA->isMustAlias(V, II->getArgOperand(1)) && LTSize->getZExtValue() >= CSize->getZExtValue()) return true; @@ -1226,12 +1283,14 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V, // does) and we're querying a pointer based on that alloca, then we know // the memory is definitely undef, regardless of how exactly we alias. // The size also doesn't matter, as an out-of-bounds access would be UB. - AllocaInst *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V)); - if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) { - const DataLayout &DL = Alloca->getModule()->getDataLayout(); - if (Optional<TypeSize> AllocaSize = Alloca->getAllocationSizeInBits(DL)) - if (*AllocaSize == LTSize->getValue() * 8) - return true; + if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) { + if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) { + const DataLayout &DL = Alloca->getModule()->getDataLayout(); + if (Optional<TypeSize> AllocaSize = + Alloca->getAllocationSizeInBits(DL)) + if (*AllocaSize == LTSize->getValue() * 8) + return true; + } } } } @@ -1266,12 +1325,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, // Don't worry about sizes larger than i64. // A known memset size is required. - ConstantInt *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize); + auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize); if (!CMemSetSize) return false; // A known memcpy size is also required. 
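// [A toy sketch for illustration; not part of this patch.] The alloca-size
// comparison in hasUndefContents above guards a units mismatch: lifetime
// intrinsics measure their size operand in bytes, while
// getAllocationSizeInBits() reports bits, so the lifetime.start covers the
// whole alloca only when LifetimeBytes * 8 equals the alloca's bit size.
#include <cstdint>

static bool lifetimeCoversAlloca(bool AllocaSizeKnown,
                                 uint64_t AllocaSizeInBits,
                                 uint64_t LifetimeBytes) {
  return AllocaSizeKnown && AllocaSizeInBits == LifetimeBytes * 8;
}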
- ConstantInt *CCopySize = dyn_cast<ConstantInt>(CopySize); + auto *CCopySize = dyn_cast<ConstantInt>(CopySize); if (!CCopySize) return false; if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { @@ -1323,7 +1382,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { } // If copying from a constant, try to turn the memcpy into a memset. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) + if (auto *GV = dyn_cast<GlobalVariable>(M->getSource())) if (GV->isConstant() && GV->hasDefinitiveInitializer()) if (Value *ByteVal = isBytewiseValue(GV->getInitializer(), M->getModule()->getDataLayout())) { @@ -1370,7 +1429,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { // d) memcpy from a just-memset'd source can be turned into memset. if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) { if (Instruction *MI = MD->getMemoryInst()) { - if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) { + if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) { if (auto *C = dyn_cast<CallInst>(MI)) { // The memcpy must post-dom the call. Limit to the same block for // now. Additionally, we need to ensure that there are no accesses @@ -1469,7 +1528,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { return false; // The length of the memcpy must be larger or equal to the size of the byval. - ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); + auto *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (!C1 || !TypeSize::isKnownGE( TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize)) return false; @@ -1540,13 +1599,13 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) { bool RepeatInstruction = false; - if (StoreInst *SI = dyn_cast<StoreInst>(I)) + if (auto *SI = dyn_cast<StoreInst>(I)) MadeChange |= processStore(SI, BI); - else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) + else if (auto *M = dyn_cast<MemSetInst>(I)) RepeatInstruction = processMemSet(M, BI); - else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) + else if (auto *M = dyn_cast<MemCpyInst>(I)) RepeatInstruction = processMemCpy(M, BI); - else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) + else if (auto *M = dyn_cast<MemMoveInst>(I)) RepeatInstruction = processMemMove(M); else if (auto *CB = dyn_cast<CallBase>(I)) { for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 10a8742940b1..2476e6c408b1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1198,9 +1198,10 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const { if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) { - Value *V = SimplifyGEPInst(GEPI->getSourceElementType(), - ArrayRef<Value *>(E->op_begin(), E->op_end()), - GEPI->isInBounds(), SQ); + Value *V = + SimplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(), + makeArrayRef(std::next(E->op_begin()), E->op_end()), + GEPI->isInBounds(), SQ); if (auto Simplified = checkExprResults(E, I, V)) return Simplified; } else if (AllConstant) { @@ -1322,11 +1323,11 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst, Value *NewGVN::lookupOperandLeader(Value *V) const { CongruenceClass *CC = ValueToClass.lookup(V); if (CC) { - // Everything in TOP is represented by undef, as it can be any value. 
+ // Everything in TOP is represented by poison, as it can be any value. // We do have to make sure we get the type right though, so we can't set the - // RepLeader to undef. + // RepLeader to poison. if (CC == TOPClass) - return UndefValue::get(V->getType()); + return PoisonValue::get(V->getType()); return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader(); } @@ -1493,8 +1494,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. Note that this is only true in the case // that the result of the allocation is pointer equal to the load ptr. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || - isAlignedAllocLikeFn(DepInst, TLI)) { + if (isa<AllocaInst>(DepInst)) { return createConstantExpression(UndefValue::get(LoadType)); } // If this load occurs either right after a lifetime begin, @@ -1502,12 +1502,10 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start) return createConstantExpression(UndefValue::get(LoadType)); - } - // If this load follows a calloc (which zero initializes memory), - // then the loaded value is zero - else if (isCallocLikeFn(DepInst, TLI)) { - return createConstantExpression(Constant::getNullValue(LoadType)); - } + } else if (isAllocationFn(DepInst, TLI)) + if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst), + TLI, LoadType)) + return createConstantExpression(InitVal); return nullptr; } @@ -1521,9 +1519,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { return nullptr; Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand()); - // Load of undef is undef. + // Load of undef is UB. if (isa<UndefValue>(LoadAddressLeader)) - return createConstantExpression(UndefValue::get(LI->getType())); + return createConstantExpression(PoisonValue::get(LI->getType())); MemoryAccess *OriginalAccess = getMemoryAccess(I); MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(OriginalAccess); @@ -1531,9 +1529,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { if (!MSSA->isLiveOnEntryDef(DefiningAccess)) { if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) { Instruction *DefiningInst = MD->getMemoryInst(); - // If the defining instruction is not reachable, replace with undef. + // If the defining instruction is not reachable, replace with poison. if (!ReachableBlocks.count(DefiningInst->getParent())) - return createConstantExpression(UndefValue::get(LI->getType())); + return createConstantExpression(PoisonValue::get(LI->getType())); // This will handle stores and memory insts. We only do if it the // defining access has a different type, or it is a pointer produced by // certain memory operations that cause the memory to have a fixed value @@ -1722,8 +1720,12 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, // We match the semantics of SimplifyPhiNode from InstructionSimplify here. // See if all arguments are the same. // We track if any were undef because they need special handling. 
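// [A toy sketch for illustration; not part of this patch.] The
// SimplifyGEPInst call a few lines above adapts to a split signature: the
// base pointer travels separately from the index list, so the expression's
// operand range is peeled into operand 0 plus
// makeArrayRef(std::next(op_begin()), op_end()).
#include <vector>

struct GEPOperands {
  int Pointer;
  std::vector<int> Indices;
};

// Assumes Ops is non-empty, as a GEP always has a base pointer operand.
static GEPOperands splitGEPOperands(const std::vector<int> &Ops) {
  GEPOperands Res;
  Res.Pointer = Ops.front();                      // operand 0: base pointer
  Res.Indices.assign(Ops.begin() + 1, Ops.end()); // the rest: indices
  return Res;
}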
-  bool HasUndef = false;
+  bool HasUndef = false, HasPoison = false;
   auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+    if (isa<PoisonValue>(Arg)) {
+      HasPoison = true;
+      return false;
+    }
     if (isa<UndefValue>(Arg)) {
       HasUndef = true;
       return false;
@@ -1732,8 +1734,14 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
   });
   // If we are left with no operands, it's dead.
   if (Filtered.empty()) {
-    // If it has undef at this point, it means there are no-non-undef arguments,
-    // and thus, the value of the phi node must be undef.
+    // If it has undef or poison at this point, it means there are no non-undef
+    // arguments, and thus, the value of the phi node must be undef.
+    if (HasPoison && !HasUndef) {
+      LLVM_DEBUG(
+          dbgs() << "PHI Node " << *I
+                 << " has no non-poison arguments, valuing it as poison\n");
+      return createConstantExpression(PoisonValue::get(I->getType()));
+    }
     if (HasUndef) {
       LLVM_DEBUG(
           dbgs() << "PHI Node " << *I
@@ -1758,7 +1766,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
   // expression to say if one is equivalent to the other.
   // We also special case undef, so that if we have an undef, we can't use the
   // common value unless it dominates the phi block.
-  if (HasUndef) {
+  if (HasPoison || HasUndef) {
     // If we have undef and at least one other value, this is really a
     // multivalued phi, and we need to know if it's cycle free in order to
     // evaluate whether we can ignore the undef. The other parts of this are
@@ -2579,6 +2587,15 @@ bool NewGVN::OpIsSafeForPHIOfOpsHelper(
   }
   auto *OrigI = cast<Instruction>(V);
+  // When we hit an instruction that reads memory (load, call, etc), we must
+  // consider any store that may happen in the loop. For now, we assume the
+  // worst: there is a store in the loop that aliases with this read.
+  // The case where the load is outside the loop is already covered by the
+  // dominator check above.
+  // TODO: relax this condition
+  if (OrigI->mayReadFromMemory())
+    return false;
+
   for (auto *Op : OrigI->operand_values()) {
     if (!isa<Instruction>(Op))
       continue;
@@ -2780,7 +2797,7 @@ NewGVN::makePossiblePHIOfOps(Instruction *I,
       LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
                         << getBlockName(PredBB)
                         << " because the block is unreachable\n");
-      FoundVal = UndefValue::get(I->getType());
+      FoundVal = PoisonValue::get(I->getType());
       RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
     }

@@ -3459,7 +3476,7 @@ bool NewGVN::runGVN() {
   // Delete all instructions marked for deletion.
   for (Instruction *ToErase : InstructionsToErase) {
     if (!ToErase->use_empty())
-      ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+      ToErase->replaceAllUsesWith(PoisonValue::get(ToErase->getType()));

     assert(ToErase->getParent() &&
            "BB containing ToErase deleted unexpectedly!");
@@ -3677,7 +3694,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
   for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
     Instruction &Inst = *I++;
     if (!Inst.use_empty())
-      Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+      Inst.replaceAllUsesWith(PoisonValue::get(Inst.getType()));
     if (isa<LandingPadInst>(Inst))
       continue;
     salvageKnowledge(&Inst, AC);
@@ -3687,7 +3704,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
   }

   // Now insert something that simplifycfg will turn into an unreachable.
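// [A toy sketch for illustration; not part of this patch.] The PHI
// evaluation above now tracks poison separately from undef while filtering
// operands: when every argument was filtered out, an all-poison PHI folds to
// poison, and only a genuine undef argument folds it to undef.
#include <vector>

enum class VClass { Poison, Undef, Real };

static VClass foldPhiModel(const std::vector<VClass> &Args) {
  bool HasUndef = false, HasPoison = false, HasReal = false;
  for (VClass A : Args) {
    if (A == VClass::Poison)
      HasPoison = true;
    else if (A == VClass::Undef)
      HasUndef = true;
    else
      HasReal = true;
  }
  if (!HasReal)
    return (HasPoison && !HasUndef) ? VClass::Poison : VClass::Undef;
  return VClass::Real; // kept at least one meaningful operand
}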
Type *Int8Ty = Type::getInt8Ty(BB->getContext()); - new StoreInst(UndefValue::get(Int8Ty), + new StoreInst(PoisonValue::get(Int8Ty), Constant::getNullValue(Int8Ty->getPointerTo()), BB->getTerminator()); } @@ -3827,8 +3844,8 @@ bool NewGVN::eliminateInstructions(Function &F) { LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " << getBlockName(PHI->getIncomingBlock(Operand)) - << " with undef due to it being unreachable\n"); - Operand.set(UndefValue::get(PHI->getType())); + << " with poison due to it being unreachable\n"); + Operand.set(PoisonValue::get(PHI->getType())); } }; // Replace unreachable phi arguments. @@ -4128,21 +4145,25 @@ bool NewGVN::eliminateInstructions(Function &F) { unsigned int NewGVN::getRank(const Value *V) const { // Prefer constants to undef to anything else // Undef is a constant, have to check it first. + // Prefer poison to undef as it's less defined. // Prefer smaller constants to constantexprs + // Note that the order here matters because of class inheritance if (isa<ConstantExpr>(V)) - return 2; - if (isa<UndefValue>(V)) + return 3; + if (isa<PoisonValue>(V)) return 1; + if (isa<UndefValue>(V)) + return 2; if (isa<Constant>(V)) return 0; - else if (auto *A = dyn_cast<Argument>(V)) - return 3 + A->getArgNo(); + if (auto *A = dyn_cast<Argument>(V)) + return 4 + A->getArgNo(); - // Need to shift the instruction DFS by number of arguments + 3 to account for + // Need to shift the instruction DFS by number of arguments + 5 to account for // the constant and argument ranking above. unsigned Result = InstrToDFSNum(V); if (Result > 0) - return 4 + NumFuncArgs + Result; + return 5 + NumFuncArgs + Result; // Unreachable or something else, just return a really large number. return ~0; } diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 44027ccd92ca..e0d0301c1ef6 100644 --- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -82,6 +82,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // Add attribute "readnone" so that backend can use a native sqrt instruction // for this call. + Call->removeFnAttr(Attribute::WriteOnly); Call->addFnAttr(Attribute::ReadNone); // Insert a FP compare instruction and use it as the CurrBB branch condition. diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index e12eca0ed287..3da367341d2a 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1367,13 +1367,13 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx, return AL; // Remove the readonly, readnone, and statepoint function attributes. - AttrBuilder FnAttrs = AL.getFnAttrs(); + AttrBuilder FnAttrs(Ctx, AL.getFnAttrs()); for (auto Attr : FnAttrsToStrip) FnAttrs.removeAttribute(Attr); for (Attribute A : AL.getFnAttrs()) { if (isStatepointDirectiveAttr(A)) - FnAttrs.remove(A); + FnAttrs.removeAttribute(A); } // Just skip parameter and return attributes for now @@ -2643,10 +2643,10 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // List of all parameter and return attributes which must be stripped when // lowering from the abstract machine model. Note that we list attributes // here which aren't valid as return attributes, that is okay. 
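// [A toy sketch for illustration; not part of this patch.] The new getRank
// order above encodes leader preference as: plain constants first, then
// poison (less defined than undef), then undef, then constant expressions,
// then arguments by position, then instructions by DFS number; the lowest
// rank wins.
enum class VK { Constant, Poison, Undef, ConstExpr, Arg, Inst };

static unsigned rankModel(VK K, unsigned ArgNoOrDFS = 0,
                          unsigned NumFuncArgs = 0) {
  switch (K) {
  case VK::Constant:  return 0;
  case VK::Poison:    return 1;
  case VK::Undef:     return 2;
  case VK::ConstExpr: return 3;
  case VK::Arg:       return 4 + ArgNoOrDFS;
  case VK::Inst:      return 5 + NumFuncArgs + ArgNoOrDFS;
  }
  return ~0u; // unreachable
}

int main() {
  // Poison now outranks undef as a replacement candidate.
  return rankModel(VK::Poison) < rankModel(VK::Undef) ? 0 : 1;
}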
-static AttrBuilder getParamAndReturnAttributesToRemove() { - AttrBuilder R; - R.addDereferenceableAttr(1); - R.addDereferenceableOrNullAttr(1); +static AttributeMask getParamAndReturnAttributesToRemove() { + AttributeMask R; + R.addAttribute(Attribute::Dereferenceable); + R.addAttribute(Attribute::DereferenceableOrNull); R.addAttribute(Attribute::ReadNone); R.addAttribute(Attribute::ReadOnly); R.addAttribute(Attribute::WriteOnly); @@ -2668,7 +2668,7 @@ static void stripNonValidAttributesFromPrototype(Function &F) { return; } - AttrBuilder R = getParamAndReturnAttributesToRemove(); + AttributeMask R = getParamAndReturnAttributesToRemove(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) F.removeParamAttrs(A.getArgNo(), R); @@ -2742,7 +2742,7 @@ static void stripNonValidDataFromBody(Function &F) { stripInvalidMetadataFromInstruction(I); - AttrBuilder R = getParamAndReturnAttributesToRemove(); + AttributeMask R = getParamAndReturnAttributesToRemove(); if (auto *Call = dyn_cast<CallBase>(&I)) { for (int i = 0, e = Call->arg_size(); i != e; i++) if (isa<PointerType>(Call->getArgOperand(i)->getType())) diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index ff2f8a25f379..c34da51e6dc1 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -486,7 +486,7 @@ bool llvm::runIPSCCP( // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove // them from both the function and callsites. if (ReplacedPointerArg) { - AttrBuilder AttributesToRemove; + AttributeMask AttributesToRemove; AttributesToRemove.addAttribute(Attribute::ArgMemOnly); AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); F.removeFnAttrs(AttributesToRemove); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 31c8999c3724..35497ae5ed9a 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -323,7 +323,7 @@ private: /// /// Note that these are not separated by slice. This is because we expect an /// alloca to be completely rewritten or not rewritten at all. If rewritten, - /// all these instructions can simply be removed and replaced with undef as + /// all these instructions can simply be removed and replaced with poison as /// they come from outside of the allocated space. SmallVector<Instruction *, 8> DeadUsers; @@ -333,10 +333,10 @@ private: /// Operands which will become dead if we rewrite the alloca. /// /// These are operands that in their particular use can be replaced with - /// undef when we rewrite the alloca. These show up in out-of-bounds inputs + /// poison when we rewrite the alloca. These show up in out-of-bounds inputs /// to PHI nodes and the like. They aren't entirely dead (there might be /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we - /// want to swap this particular input for undef to simplify the use lists of + /// want to swap this particular input for poison to simplify the use lists of /// the alloca. SmallVector<Use *, 8> DeadOperands; }; @@ -1008,6 +1008,13 @@ private: if (I.use_empty()) return markAsDead(I); + // If this is a PHI node before a catchswitch, we cannot insert any non-PHI + // instructions in this BB, which may be required during rewriting. Bail out + // on these cases. 
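// [Usage sketch, not from this patch; it assumes the LLVM-14-era API spelled
// in the hunk above.] AttributeMask replaces AttrBuilder for pure removal:
// a mask names attribute kinds to strip and carries no values, so
// value-bearing attributes such as dereferenceable(N) are removed by kind
// alone.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void stripSpeculationAttrsSketch(Function &F) {
  AttributeMask R;
  R.addAttribute(Attribute::Dereferenceable); // drops any dereferenceable(N)
  R.addAttribute(Attribute::ReadOnly);
  for (Argument &A : F.args())
    if (A.getType()->isPointerTy())
      F.removeParamAttrs(A.getArgNo(), R);
}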
+ if (isa<PHINode>(I) && + I.getParent()->getFirstInsertionPt() == I.getParent()->end()) + return PI.setAborted(&I); + // TODO: We could use SimplifyInstruction here to fold PHINodes and // SelectInsts. However, doing so requires to change the current // dead-operand-tracking mechanism. For instance, suppose neither loading @@ -1023,7 +1030,7 @@ private: enqueueUsers(I); else // Otherwise the operand to the PHI/select is dead, and we can replace - // it with undef. + // it with poison. AS.DeadOperands.push_back(U); return; @@ -1043,7 +1050,7 @@ private: // For PHI and select operands outside the alloca, we can't nuke the entire // phi or select -- the other side might still be relevant, so we special // case them here and use a separate structure to track the operands - // themselves which should be replaced with undef. + // themselves which should be replaced with poison. // FIXME: This should instead be escaped in the event we're instrumenting // for address sanitization. if (Offset.uge(AllocSize)) { @@ -1264,14 +1271,14 @@ static bool isSafePHIToSpeculate(PHINode &PN) { return true; } -static void speculatePHINodeLoads(PHINode &PN) { +static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); Type *LoadTy = SomeLoad->getType(); - IRBuilderTy PHIBuilder(&PN); - PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), - PN.getName() + ".sroa.speculated"); + IRB.SetInsertPoint(&PN); + PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. @@ -1301,9 +1308,9 @@ static void speculatePHINodeLoads(PHINode &PN) { } Instruction *TI = Pred->getTerminator(); - IRBuilderTy PredBuilder(TI); + IRB.SetInsertPoint(TI); - LoadInst *Load = PredBuilder.CreateAlignedLoad( + LoadInst *Load = IRB.CreateAlignedLoad( LoadTy, InVal, Alignment, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); ++NumLoadsSpeculated; @@ -1361,10 +1368,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { return true; } -static void speculateSelectInstLoads(SelectInst &SI) { +static void speculateSelectInstLoads(IRBuilderTy &IRB, SelectInst &SI) { LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); - IRBuilderTy IRB(&SI); + IRB.SetInsertPoint(&SI); Value *TV = SI.getTrueValue(); Value *FV = SI.getFalseValue(); // Replace the loads of the select with a select of two loads. @@ -1430,8 +1437,10 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) return BasePtr; - return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(), - BasePtr, Indices, NamePrefix + "sroa_idx"); + // buildGEP() is only called for non-opaque pointers. + return IRB.CreateInBoundsGEP( + BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices, + NamePrefix + "sroa_idx"); } /// Get a natural GEP off of the BasePtr walking through Ty toward @@ -1504,7 +1513,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) return nullptr; - Type *ElementTy = Ty->getElementType(); + Type *ElementTy = Ty->getNonOpaquePointerElementType(); if (!ElementTy->isSized()) return nullptr; // We can't GEP through an unsized element. 
@@ -1563,7 +1572,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Int8PtrOffset(Offset.getBitWidth(), 0); PointerType *TargetPtrTy = cast<PointerType>(PointerTy); - Type *TargetTy = TargetPtrTy->getElementType(); + Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType(); // As `addrspacecast` is , `Ptr` (the storage pointer) may have different // address space from the expected `PointerTy` (the pointer to be used). @@ -2558,7 +2567,7 @@ private: // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation. Value *Placeholder = new LoadInst( - LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "", + LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "", false, Align(1)); V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); @@ -3223,8 +3232,11 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { /// Used to calculate offsets, and hence alignment, of subobjects. const DataLayout &DL; + IRBuilderTy &IRB; + public: - AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} + AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB) + : DL(DL), IRB(IRB) {} /// Rewrite loads and stores through a pointer and all pointers derived from /// it. @@ -3255,7 +3267,7 @@ private: template <typename Derived> class OpSplitter { protected: /// The builder used to form new instructions. - IRBuilderTy IRB; + IRBuilderTy &IRB; /// The indices which to be used with insert- or extractvalue to select the /// appropriate value within the aggregate. @@ -3282,9 +3294,11 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - Align BaseAlign, const DataLayout &DL) - : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), - BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} + Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB) + : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), + BaseAlign(BaseAlign), DL(DL) { + IRB.SetInsertPoint(InsertionPoint); + } public: /// Generic recursive split emission routine. @@ -3345,9 +3359,10 @@ private: AAMDNodes AATags; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) - : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, + IRBuilderTy &IRB) + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL, + IRB), AATags(AATags) {} /// Emit a leaf load of a single value. This is called at the leaves of the @@ -3379,8 +3394,8 @@ private: // We have an aggregate being loaded, split it apart. 
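// [Usage sketch, not from this patch; it assumes the LLVM-14-era opaque
// pointer API used in the hunks above.] getNonOpaquePointerElementType()
// asserts that the pointer is still typed, so callers on the opaque-pointer
// migration path guard it explicitly.
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static Type *elementTypeOrNull(PointerType *PtrTy) {
  if (PtrTy->isOpaque())
    return nullptr; // opaque pointers carry no element type at all
  return PtrTy->getNonOpaquePointerElementType();
}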
LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), - getAdjustedAlignment(&LI, 0), DL); - Value *V = UndefValue::get(LI.getType()); + getAdjustedAlignment(&LI, 0), DL, IRB); + Value *V = PoisonValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); Visited.erase(&LI); LI.replaceAllUsesWith(V); @@ -3390,9 +3405,10 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, + IRBuilderTy &IRB) : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), + DL, IRB), AATags(AATags) {} AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the @@ -3430,7 +3446,7 @@ private: // We have an aggregate being stored, split it apart. LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), - getAdjustedAlignment(&SI, 0), DL); + getAdjustedAlignment(&SI, 0), DL, IRB); Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); Visited.erase(&SI); SI.eraseFromParent(); @@ -3458,7 +3474,7 @@ private: << "\n original: " << *Sel << "\n " << GEPI); - IRBuilderTy Builder(&GEPI); + IRB.SetInsertPoint(&GEPI); SmallVector<Value *, 4> Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); @@ -3466,21 +3482,20 @@ private: Value *True = Sel->getTrueValue(); Value *NTrue = IsInBounds - ? Builder.CreateInBoundsGEP(Ty, True, Index, - True->getName() + ".sroa.gep") - : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep"); + ? IRB.CreateInBoundsGEP(Ty, True, Index, + True->getName() + ".sroa.gep") + : IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep"); Value *False = Sel->getFalseValue(); Value *NFalse = IsInBounds - ? Builder.CreateInBoundsGEP(Ty, False, Index, - False->getName() + ".sroa.gep") - : Builder.CreateGEP(Ty, False, Index, - False->getName() + ".sroa.gep"); + ? IRB.CreateInBoundsGEP(Ty, False, Index, + False->getName() + ".sroa.gep") + : IRB.CreateGEP(Ty, False, Index, False->getName() + ".sroa.gep"); - Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse, - Sel->getName() + ".sroa.sel"); + Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, + Sel->getName() + ".sroa.sel"); Visited.erase(&GEPI); GEPI.replaceAllUsesWith(NSel); GEPI.eraseFromParent(); @@ -3517,10 +3532,9 @@ private: SmallVector<Value *, 4> Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); - IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI()); - PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(), - PHI->getNumIncomingValues(), - PHI->getName() + ".sroa.phi"); + IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI()); + PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(), + PHI->getName() + ".sroa.phi"); for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) { BasicBlock *B = PHI->getIncomingBlock(I); Value *NewVal = nullptr; @@ -3530,11 +3544,12 @@ private: } else { Instruction *In = cast<Instruction>(PHI->getIncomingValue(I)); - IRBuilderTy B(In->getParent(), std::next(In->getIterator())); + IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); Type *Ty = GEPI.getSourceElementType(); - NewVal = IsInBounds - ? 
B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep")
-                     : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep");
+        NewVal = IsInBounds ? IRB.CreateInBoundsGEP(Ty, In, Index,
+                                                    In->getName() + ".sroa.gep")
+                            : IRB.CreateGEP(Ty, In, Index,
+                                            In->getName() + ".sroa.gep");
       }
       NewPN->addIncoming(NewVal, B);
     }
@@ -4557,11 +4572,11 @@ bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   return Changed;
 }

-/// Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with poison, deleting the used value if it becomes dead.
 void SROAPass::clobberUse(Use &U) {
   Value *OldV = U;

-  // Replace the use with an undef value.
-  U = UndefValue::get(OldV->getType());
+  // Replace the use with a poison value.
+  U = PoisonValue::get(OldV->getType());

   // Check for this making an instruction dead. We have to garbage collect
   // all the dead instructions to ensure the uses of any alloca end up being
@@ -4598,7 +4613,8 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {

   // First, split any FCA loads and stores touching this alloca to promote
   // better splitting and promotion opportunities.
-  AggLoadStoreRewriter AggRewriter(DL);
+  IRBuilderTy IRB(&AI);
+  AggLoadStoreRewriter AggRewriter(DL, IRB);
   Changed |= AggRewriter.rewrite(AI);

   // Build the slices using a recursive instruction-visiting builder.
@@ -4614,7 +4630,7 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
       clobberUse(DeadOp);

     // Now replace the uses of this instruction.
-    DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+    DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));

     // And mark it for deletion.
     DeadInsts.push_back(DeadUser);
@@ -4633,11 +4649,11 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {

   LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
   while (!SpeculatablePHIs.empty())
-    speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+    speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());

   LLVM_DEBUG(dbgs() << "  Speculating Selects\n");
   while (!SpeculatableSelects.empty())
-    speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+    speculateSelectInstLoads(IRB, *SpeculatableSelects.pop_back_val());

   return Changed;
 }
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 1284bae820a4..29cea42e4a00 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
     Type *LoadTy = CI->getType();
     Align Alignment =
         DL.getValueOrABITypeAlignment(MA, LoadTy->getScalarType());
-    if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+    if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+        !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
       return false;
     scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
     return true;
@@ -970,7 +971,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
     Type *StoreTy = CI->getArgOperand(0)->getType();
     Align Alignment =
         DL.getValueOrABITypeAlignment(MA, StoreTy->getScalarType());
-    if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+    if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+        !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
+                                         Alignment))
       return false;
     scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
     return true;
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 6b7419abe1d1..3606c8a4b073 100644
---
a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -270,7 +270,7 @@ Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, Type *Ty = V->getType(); PtrTy = dyn_cast<PointerType>(Ty); if (PtrTy) - Ty = PtrTy->getElementType(); + Ty = PtrTy->getPointerElementType(); Size = cast<FixedVectorType>(Ty)->getNumElements(); if (!CachePtr) Tmp.resize(Size, nullptr); @@ -288,7 +288,8 @@ Value *Scatterer::operator[](unsigned I) { return CV[I]; IRBuilder<> Builder(BB, BBI); if (PtrTy) { - Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType(); + Type *ElTy = + cast<VectorType>(PtrTy->getPointerElementType())->getElementType(); if (!CV[0]) { Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace()); CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0"); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 3799d2dd1cf2..ee17da1875e5 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -78,6 +78,79 @@ static cl::opt<bool> UserSinkCommonInsts( STATISTIC(NumSimpl, "Number of blocks simplified"); +static bool +performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs, + std::vector<DominatorTree::UpdateType> *Updates) { + SmallVector<PHINode *, 1> NewOps; + + // We don't want to change IR just because we can. + // Only do that if there are at least two blocks we'll tail-merge. + if (BBs.size() < 2) + return false; + + if (Updates) + Updates->reserve(Updates->size() + BBs.size()); + + BasicBlock *CanonicalBB; + Instruction *CanonicalTerm; + { + auto *Term = BBs[0]->getTerminator(); + + // Create a canonical block for this function terminator type now, + // placing it *before* the first block that will branch to it. + CanonicalBB = BasicBlock::Create( + F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]); + // We'll also need a PHI node per each operand of the terminator. + NewOps.resize(Term->getNumOperands()); + for (auto I : zip(Term->operands(), NewOps)) { + std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(), + /*NumReservedValues=*/BBs.size(), + CanonicalBB->getName() + ".op"); + CanonicalBB->getInstList().push_back(std::get<1>(I)); + } + // Make it so that this canonical block actually has the right + // terminator. + CanonicalTerm = Term->clone(); + CanonicalBB->getInstList().push_back(CanonicalTerm); + // If the canonical terminator has operands, rewrite it to take PHI's. + for (auto I : zip(NewOps, CanonicalTerm->operands())) + std::get<1>(I) = std::get<0>(I); + } + + // Now, go through each block (with the current terminator type) + // we've recorded, and rewrite it to branch to the new common block. + const DILocation *CommonDebugLoc = nullptr; + for (BasicBlock *BB : BBs) { + auto *Term = BB->getTerminator(); + assert(Term->getOpcode() == CanonicalTerm->getOpcode() && + "All blocks to be tail-merged must be the same " + "(function-terminating) terminator type."); + + // Aha, found a new non-canonical function terminator. If it has operands, + // forward them to the PHI nodes in the canonical block. + for (auto I : zip(Term->operands(), NewOps)) + std::get<1>(I)->addIncoming(std::get<0>(I), BB); + + // Compute the debug location common to all the original terminators. 
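// [A toy sketch for illustration; not part of this patch. The -1 sentinel
// loosely models DILocation::getMergedLocation collapsing differing
// locations.] The fold just below seeds the common debug location with the
// first terminator's location and then merges each further one pairwise.
#include <vector>

using Loc = int; // hypothetical stand-in for DILocation*; -1 == unknown

static Loc mergeTwo(Loc A, Loc B) { return A == B ? A : -1; }

static Loc mergeAll(const std::vector<Loc> &TermLocs) {
  bool Seen = false;
  Loc Common = -1;
  for (Loc L : TermLocs) {
    Common = Seen ? mergeTwo(Common, L) : L;
    Seen = true;
  }
  return Common;
}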
+ if (!CommonDebugLoc) + CommonDebugLoc = Term->getDebugLoc(); + else + CommonDebugLoc = + DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc()); + + // And turn BB into a block that just unconditionally branches + // to the canonical block. + Term->eraseFromParent(); + BranchInst::Create(CanonicalBB, BB); + if (Updates) + Updates->push_back({DominatorTree::Insert, BB, CanonicalBB}); + } + + CanonicalTerm->setDebugLoc(CommonDebugLoc); + + return true; +} + static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, DomTreeUpdater *DTU) { SmallMapVector<unsigned /*TerminatorOpcode*/, SmallVector<BasicBlock *, 2>, 4> @@ -133,73 +206,8 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, std::vector<DominatorTree::UpdateType> Updates; - for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure)) { - SmallVector<PHINode *, 1> NewOps; - - // We don't want to change IR just because we can. - // Only do that if there are at least two blocks we'll tail-merge. - if (BBs.size() < 2) - continue; - - Changed = true; - - if (DTU) - Updates.reserve(Updates.size() + BBs.size()); - - BasicBlock *CanonicalBB; - Instruction *CanonicalTerm; - { - auto *Term = BBs[0]->getTerminator(); - - // Create a canonical block for this function terminator type now, - // placing it *before* the first block that will branch to it. - CanonicalBB = BasicBlock::Create( - F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]); - // We'll also need a PHI node per each operand of the terminator. - NewOps.resize(Term->getNumOperands()); - for (auto I : zip(Term->operands(), NewOps)) { - std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(), - /*NumReservedValues=*/BBs.size(), - CanonicalBB->getName() + ".op"); - CanonicalBB->getInstList().push_back(std::get<1>(I)); - } - // Make it so that this canonical block actually has the right - // terminator. - CanonicalTerm = Term->clone(); - CanonicalBB->getInstList().push_back(CanonicalTerm); - // If the canonical terminator has operands, rewrite it to take PHI's. - for (auto I : zip(NewOps, CanonicalTerm->operands())) - std::get<1>(I) = std::get<0>(I); - } - - // Now, go through each block (with the current terminator type) - // we've recorded, and rewrite it to branch to the new common block. - const DILocation *CommonDebugLoc = nullptr; - for (BasicBlock *BB : BBs) { - auto *Term = BB->getTerminator(); - - // Aha, found a new non-canonical function terminator. If it has operands, - // forward them to the PHI nodes in the canonical block. - for (auto I : zip(Term->operands(), NewOps)) - std::get<1>(I)->addIncoming(std::get<0>(I), BB); - - // Compute the debug location common to all the original terminators. - if (!CommonDebugLoc) - CommonDebugLoc = Term->getDebugLoc(); - else - CommonDebugLoc = - DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc()); - - // And turn BB into a block that just unconditionally branches - // to the canonical block. - Term->eraseFromParent(); - BranchInst::Create(CanonicalBB, BB); - if (DTU) - Updates.push_back({DominatorTree::Insert, BB, CanonicalBB}); - } - - CanonicalTerm->setDebugLoc(CommonDebugLoc); - } + for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure)) + Changed |= performBlockTailMerging(F, BBs, DTU ? 
&Updates : nullptr); if (DTU) DTU->applyUpdates(Updates); @@ -313,7 +321,7 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.SinkCommonInsts = UserSinkCommonInsts; } -SimplifyCFGPass::SimplifyCFGPass() : Options() { +SimplifyCFGPass::SimplifyCFGPass() { applyCommandLineOverridesToOptions(Options); } diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp index fdc914a72bfd..c734611836eb 100644 --- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -22,19 +22,6 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-emit-printf" -static bool isCString(const Value *Arg) { - auto Ty = Arg->getType(); - auto PtrTy = dyn_cast<PointerType>(Ty); - if (!PtrTy) - return false; - - auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType()); - if (!IntTy) - return false; - - return IntTy->getBitWidth() == 8; -} - static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) { auto Int64Ty = Builder.getInt64Ty(); auto Ty = Arg->getType(); @@ -176,13 +163,15 @@ static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str, static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg, bool IsLast) { + Arg = Builder.CreateBitCast( + Arg, Builder.getInt8PtrTy(Arg->getType()->getPointerAddressSpace())); auto Length = getStrlenWithNull(Builder, Arg); return callAppendStringN(Builder, Desc, Arg, Length, IsLast); } static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, bool SpecIsCString, bool IsLast) { - if (SpecIsCString && isCString(Arg)) { + if (SpecIsCString && isa<PointerType>(Arg->getType())) { return appendString(Builder, Desc, Arg, IsLast); } // If the format specifies a string but the argument is not, the frontend will diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 580cfd80141e..97f11ca71726 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -34,6 +34,7 @@ STATISTIC(NumReadNone, "Number of functions inferred as readnone"); STATISTIC(NumInaccessibleMemOnly, "Number of functions inferred as inaccessiblememonly"); STATISTIC(NumReadOnly, "Number of functions inferred as readonly"); +STATISTIC(NumWriteOnly, "Number of functions inferred as writeonly"); STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly"); STATISTIC(NumInaccessibleMemOrArgMemOnly, "Number of functions inferred as inaccessiblemem_or_argmemonly"); @@ -71,6 +72,19 @@ static bool setOnlyReadsMemory(Function &F) { return true; } +static bool setOnlyWritesMemory(Function &F) { + if (F.onlyWritesMemory()) // writeonly or readnone + return false; + // Turn readonly and writeonly into readnone. 
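// [A toy sketch for illustration; not part of this patch.] The new
// setOnlyWritesMemory above intersects memory-effect facts: a function
// already known readonly that is additionally proven writeonly can access no
// memory at all, so the two attributes combine into readnone.
enum class Mem { Any, ReadOnly, WriteOnly, None };

static Mem intersectWriteOnly(Mem Cur) {
  switch (Cur) {
  case Mem::Any:       return Mem::WriteOnly;
  case Mem::ReadOnly:  return Mem::None; // readonly && writeonly == readnone
  case Mem::WriteOnly: return Mem::WriteOnly;
  case Mem::None:      return Mem::None;
  }
  return Mem::None; // unreachable
}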
+ if (F.hasFnAttribute(Attribute::ReadOnly)) { + F.removeFnAttr(Attribute::ReadOnly); + return setDoesNotAccessMemory(F); + } + ++NumWriteOnly; + F.setOnlyWritesMemory(); + return true; +} + static bool setOnlyAccessesArgMemory(Function &F) { if (F.onlyAccessesArgMemory()) return false; @@ -233,6 +247,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { switch (TheLibFunc) { case LibFunc_strlen: + case LibFunc_strnlen: case LibFunc_wcslen: Changed |= setOnlyReadsMemory(F); Changed |= setDoesNotThrow(F); @@ -400,6 +415,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setOnlyReadsMemory(F, 0); return Changed; + case LibFunc_aligned_alloc: + case LibFunc_valloc: case LibFunc_malloc: case LibFunc_vec_malloc: Changed |= setOnlyAccessesInaccessibleMemory(F); @@ -484,6 +501,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_realloc: case LibFunc_vec_realloc: + case LibFunc_reallocf: Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F); Changed |= setRetNoUndef(F); Changed |= setDoesNotThrow(F); @@ -492,11 +510,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setArgNoUndef(F, 1); return Changed; - case LibFunc_reallocf: - Changed |= setRetNoUndef(F); - Changed |= setWillReturn(F); - Changed |= setArgNoUndef(F, 1); - return Changed; case LibFunc_read: // May throw; "read" is a valid pthread cancellation point. Changed |= setRetAndArgsNoUndef(F); @@ -536,13 +549,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_aligned_alloc: - Changed |= setOnlyAccessesInaccessibleMemory(F); - Changed |= setRetAndArgsNoUndef(F); - Changed |= setDoesNotThrow(F); - Changed |= setRetDoesNotAlias(F); - Changed |= setWillReturn(F); - return Changed; case LibFunc_bcopy: Changed |= setDoesNotThrow(F); Changed |= setOnlyAccessesArgMemory(F); @@ -569,6 +575,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_calloc: case LibFunc_vec_calloc: + Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); Changed |= setRetDoesNotAlias(F); @@ -851,13 +858,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_valloc: - Changed |= setOnlyAccessesInaccessibleMemory(F); - Changed |= setRetAndArgsNoUndef(F); - Changed |= setDoesNotThrow(F); - Changed |= setRetDoesNotAlias(F); - Changed |= setWillReturn(F); - return Changed; case LibFunc_vprintf: Changed |= setRetAndArgsNoUndef(F); Changed |= setDoesNotThrow(F); @@ -1020,12 +1020,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_memset_pattern4: case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: - Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); - Changed |= setOnlyWritesMemory(F, 0); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); - return Changed; + LLVM_FALLTHROUGH; case LibFunc_memset: Changed |= setWillReturn(F); LLVM_FALLTHROUGH; @@ -1158,7 +1156,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case 
LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: - case LibFunc_strnlen: case LibFunc_tan: case LibFunc_tanf: case LibFunc_tanh: @@ -1171,6 +1168,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_truncl: Changed |= setDoesNotThrow(F); Changed |= setDoesNotFreeMemory(F); + Changed |= setOnlyWritesMemory(F); Changed |= setWillReturn(F); return Changed; default: diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp index b2763900e154..ac3839f2a4ab 100644 --- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp +++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp @@ -20,8 +20,7 @@ using namespace llvm; bool CallGraphUpdater::finalize() { if (!DeadFunctionsInComdats.empty()) { - filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(), - DeadFunctionsInComdats); + filterDeadComdatFunctions(DeadFunctionsInComdats); DeadFunctions.append(DeadFunctionsInComdats.begin(), DeadFunctionsInComdats.end()); } diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index ebe19f1751e5..56b6e4bc46a5 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -500,7 +500,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, CB.setArgOperand(ArgNo, Cast); // Remove any incompatible attributes for the argument. - AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo)); + AttrBuilder ArgAttrs(Ctx, CallerPAL.getParamAttrs(ArgNo)); ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); // We may have a different byval/inalloca type. @@ -518,7 +518,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, // If the return type of the call site doesn't match that of the callee, cast // the returned value to the appropriate type. // Remove any incompatible return value attribute. - AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + AttrBuilder RAttrs(Ctx, CallerPAL.getRetAttrs()); if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) { createRetBitCast(CB, CallSiteRetTy, RetBitCast); RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy)); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 96aff563aa9b..24cd5747c5a4 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -829,39 +829,54 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, default: RetTy = Type::getInt16Ty(header->getContext()); break; } - std::vector<Type *> paramTy; + std::vector<Type *> ParamTy; + std::vector<Type *> AggParamTy; + ValueSet StructValues; // Add the types of the input values to the function's argument list for (Value *value : inputs) { LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n"); - paramTy.push_back(value->getType()); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(value)) { + AggParamTy.push_back(value->getType()); + StructValues.insert(value); + } else + ParamTy.push_back(value->getType()); } // Add the types of the output values to the function's argument list. 
for (Value *output : outputs) { LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n"); - if (AggregateArgs) - paramTy.push_back(output->getType()); - else - paramTy.push_back(PointerType::getUnqual(output->getType())); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) { + AggParamTy.push_back(output->getType()); + StructValues.insert(output); + } else + ParamTy.push_back(PointerType::getUnqual(output->getType())); + } + + assert( + (ParamTy.size() + AggParamTy.size()) == + (inputs.size() + outputs.size()) && + "Number of scalar and aggregate params does not match inputs, outputs"); + assert(StructValues.empty() || + AggregateArgs && "Expected StructValues only with AggregateArgs set"); + + // Concatenate scalar and aggregate params in ParamTy. + size_t NumScalarParams = ParamTy.size(); + StructType *StructTy = nullptr; + if (AggregateArgs && !AggParamTy.empty()) { + StructTy = StructType::get(M->getContext(), AggParamTy); + ParamTy.push_back(PointerType::getUnqual(StructTy)); } LLVM_DEBUG({ dbgs() << "Function type: " << *RetTy << " f("; - for (Type *i : paramTy) + for (Type *i : ParamTy) dbgs() << *i << ", "; dbgs() << ")\n"; }); - StructType *StructTy = nullptr; - if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { - StructTy = StructType::get(M->getContext(), paramTy); - paramTy.clear(); - paramTy.push_back(PointerType::getUnqual(StructTy)); - } - FunctionType *funcType = - FunctionType::get(RetTy, paramTy, - AllowVarArgs && oldFunction->isVarArg()); + FunctionType *funcType = FunctionType::get( + RetTy, ParamTy, AllowVarArgs && oldFunction->isVarArg()); std::string SuffixToUse = Suffix.empty() @@ -871,13 +886,6 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Function *newFunction = Function::Create( funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(), oldFunction->getName() + "." + SuffixToUse, M); - // If the old function is no-throw, so is the new one. - if (oldFunction->doesNotThrow()) - newFunction->setDoesNotThrow(); - - // Inherit the uwtable attribute if we need to. - if (oldFunction->hasUWTable()) - newFunction->setHasUWTable(); // Inherit all of the target dependent attributes and white-listed // target independent attributes. @@ -893,53 +901,26 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, } else switch (Attr.getKindAsEnum()) { // Those attributes cannot be propagated safely. Explicitly list them - // here so we get a warning if new attributes are added. This list also - // includes non-function attributes. - case Attribute::Alignment: + // here so we get a warning if new attributes are added.
case Attribute::AllocSize: case Attribute::ArgMemOnly: case Attribute::Builtin: - case Attribute::ByVal: case Attribute::Convergent: - case Attribute::Dereferenceable: - case Attribute::DereferenceableOrNull: - case Attribute::ElementType: - case Attribute::InAlloca: - case Attribute::InReg: case Attribute::InaccessibleMemOnly: case Attribute::InaccessibleMemOrArgMemOnly: case Attribute::JumpTable: case Attribute::Naked: - case Attribute::Nest: - case Attribute::NoAlias: case Attribute::NoBuiltin: - case Attribute::NoCapture: case Attribute::NoMerge: case Attribute::NoReturn: case Attribute::NoSync: - case Attribute::NoUndef: - case Attribute::None: - case Attribute::NonNull: - case Attribute::Preallocated: case Attribute::ReadNone: case Attribute::ReadOnly: - case Attribute::Returned: case Attribute::ReturnsTwice: - case Attribute::SExt: case Attribute::Speculatable: case Attribute::StackAlignment: - case Attribute::StructRet: - case Attribute::SwiftError: - case Attribute::SwiftSelf: - case Attribute::SwiftAsync: case Attribute::WillReturn: case Attribute::WriteOnly: - case Attribute::ZExt: - case Attribute::ImmArg: - case Attribute::ByRef: - case Attribute::EndAttrKinds: - case Attribute::EmptyKey: - case Attribute::TombstoneKey: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: @@ -980,30 +961,62 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::MustProgress: case Attribute::NoProfile: break; + // These attributes cannot be applied to functions. + case Attribute::Alignment: + case Attribute::ByVal: + case Attribute::Dereferenceable: + case Attribute::DereferenceableOrNull: + case Attribute::ElementType: + case Attribute::InAlloca: + case Attribute::InReg: + case Attribute::Nest: + case Attribute::NoAlias: + case Attribute::NoCapture: + case Attribute::NoUndef: + case Attribute::NonNull: + case Attribute::Preallocated: + case Attribute::Returned: + case Attribute::SExt: + case Attribute::StructRet: + case Attribute::SwiftError: + case Attribute::SwiftSelf: + case Attribute::SwiftAsync: + case Attribute::ZExt: + case Attribute::ImmArg: + case Attribute::ByRef: + // These are not really attributes. + case Attribute::None: + case Attribute::EndAttrKinds: + case Attribute::EmptyKey: + case Attribute::TombstoneKey: + llvm_unreachable("Not a function attribute"); } newFunction->addFnAttr(Attr); } newFunction->getBasicBlockList().push_back(newRootNode); - // Create an iterator to name all of the arguments we inserted. - Function::arg_iterator AI = newFunction->arg_begin(); + // Create scalar and aggregate iterators to name all of the arguments we + // inserted. + Function::arg_iterator ScalarAI = newFunction->arg_begin(); + Function::arg_iterator AggAI = std::next(ScalarAI, NumScalarParams); // Rewrite all users of the inputs in the extracted region to use the // arguments (or appropriate addressing into struct) instead. 
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) { + for (unsigned i = 0, e = inputs.size(), aggIdx = 0; i != e; ++i) { Value *RewriteVal; - if (AggregateArgs) { + if (AggregateArgs && StructValues.contains(inputs[i])) { Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext())); - Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); + Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx); Instruction *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI); - RewriteVal = new LoadInst(StructTy->getElementType(i), GEP, + StructTy, &*AggAI, Idx, "gep_" + inputs[i]->getName(), TI); + RewriteVal = new LoadInst(StructTy->getElementType(aggIdx), GEP, "loadgep_" + inputs[i]->getName(), TI); + ++aggIdx; } else - RewriteVal = &*AI++; + RewriteVal = &*ScalarAI++; std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end()); for (User *use : Users) @@ -1013,12 +1026,14 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, } // Set names for input and output arguments. - if (!AggregateArgs) { - AI = newFunction->arg_begin(); - for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI) - AI->setName(inputs[i]->getName()); - for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI) - AI->setName(outputs[i]->getName()+".out"); + if (NumScalarParams) { + ScalarAI = newFunction->arg_begin(); + for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++ScalarAI) + if (!StructValues.contains(inputs[i])) + ScalarAI->setName(inputs[i]->getName()); + for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++ScalarAI) + if (!StructValues.contains(outputs[i])) + ScalarAI->setName(outputs[i]->getName() + ".out"); } // Rewrite branches to basic blocks outside of the loop to new dummy blocks @@ -1126,7 +1141,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, ValueSet &outputs) { // Emit a call to the new function, passing in: *pointer to struct (if // aggregating parameters), or plain inputs and allocated memory for outputs - std::vector<Value *> params, StructValues, ReloadOutputs, Reloads; + std::vector<Value *> params, ReloadOutputs, Reloads; + ValueSet StructValues; Module *M = newFunction->getParent(); LLVMContext &Context = M->getContext(); @@ -1134,23 +1150,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, CallInst *call = nullptr; // Add inputs as params, or to be filled into the struct - unsigned ArgNo = 0; + unsigned ScalarInputArgNo = 0; SmallVector<unsigned, 1> SwiftErrorArgs; for (Value *input : inputs) { - if (AggregateArgs) - StructValues.push_back(input); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(input)) + StructValues.insert(input); else { params.push_back(input); if (input->isSwiftError()) - SwiftErrorArgs.push_back(ArgNo); + SwiftErrorArgs.push_back(ScalarInputArgNo); } - ++ArgNo; + ++ScalarInputArgNo; } // Create allocas for the outputs + unsigned ScalarOutputArgNo = 0; for (Value *output : outputs) { - if (AggregateArgs) { - StructValues.push_back(output); + if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) { + StructValues.insert(output); } else { AllocaInst *alloca = new AllocaInst(output->getType(), DL.getAllocaAddrSpace(), @@ -1158,12 +1175,14 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, &codeReplacer->getParent()->front().front()); ReloadOutputs.push_back(alloca);
params.push_back(alloca); + ++ScalarOutputArgNo; } } StructType *StructArgTy = nullptr; AllocaInst *Struct = nullptr; - if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { + unsigned NumAggregatedInputs = 0; + if (AggregateArgs && !StructValues.empty()) { std::vector<Type *> ArgTypes; for (Value *V : StructValues) ArgTypes.push_back(V->getType()); @@ -1175,14 +1194,18 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, &codeReplacer->getParent()->front().front()); params.push_back(Struct); - for (unsigned i = 0, e = inputs.size(); i != e; ++i) { - Value *Idx[2]; - Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i); - GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName()); - codeReplacer->getInstList().push_back(GEP); - new StoreInst(StructValues[i], GEP, codeReplacer); + // Store aggregated inputs in the struct. + for (unsigned i = 0, e = StructValues.size(); i != e; ++i) { + if (inputs.contains(StructValues[i])) { + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName()); + codeReplacer->getInstList().push_back(GEP); + new StoreInst(StructValues[i], GEP, codeReplacer); + NumAggregatedInputs++; + } } } @@ -1205,24 +1228,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError); } - Function::arg_iterator OutputArgBegin = newFunction->arg_begin(); - unsigned FirstOut = inputs.size(); - if (!AggregateArgs) - std::advance(OutputArgBegin, inputs.size()); - - // Reload the outputs passed in by reference. - for (unsigned i = 0, e = outputs.size(); i != e; ++i) { + // Reload the outputs passed in by reference: use the struct if the output + // is in the aggregate, or reload from the scalar argument. + for (unsigned i = 0, e = outputs.size(), scalarIdx = 0, + aggIdx = NumAggregatedInputs; + i != e; ++i) { Value *Output = nullptr; - if (AggregateArgs) { + if (AggregateArgs && StructValues.contains(outputs[i])) { Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx); GetElementPtrInst *GEP = GetElementPtrInst::Create( StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName()); codeReplacer->getInstList().push_back(GEP); Output = GEP; + ++aggIdx; } else { - Output = ReloadOutputs[i]; + Output = ReloadOutputs[scalarIdx]; + ++scalarIdx; } LoadInst *load = new LoadInst(outputs[i]->getType(), Output, outputs[i]->getName() + ".reload", @@ -1304,8 +1327,13 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, // Store the arguments right after the definition of output value. // This should be done after creating exit stubs to ensure that the invoke // result restore will be placed in the outlined function.
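// [Editor's sketch; not part of this commit.] With AggregateArgs, the call boundary assembled above is conceptually the following (hypothetical names; values in ExcludeArgsFromAggregate stay scalar):
//   struct Frame { int In0; float Out0; };  // AggParamTy members, aggregated inputs first
//   Frame F;
//   F.In0 = In0;                            // caller stores aggregated inputs (NumAggregatedInputs of them)
//   extracted(ScalarIn1, &ScalarOut1, &F);  // scalar and excluded values passed directly
//   float Out0 = F.Out0;                    // caller reloads aggregated outputs after the call
// Inside the extracted function, aggregated values are reached through GEPs on the struct argument indexed by aggIdx, exactly as the stores below do.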
- Function::arg_iterator OAI = OutputArgBegin; - for (unsigned i = 0, e = outputs.size(); i != e; ++i) { + Function::arg_iterator ScalarOutputArgBegin = newFunction->arg_begin(); + std::advance(ScalarOutputArgBegin, ScalarInputArgNo); + Function::arg_iterator AggOutputArgBegin = newFunction->arg_begin(); + std::advance(AggOutputArgBegin, ScalarInputArgNo + ScalarOutputArgNo); + + for (unsigned i = 0, e = outputs.size(), aggIdx = NumAggregatedInputs; i != e; + ++i) { auto *OutI = dyn_cast<Instruction>(outputs[i]); if (!OutI) continue; @@ -1325,23 +1353,27 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, assert((InsertBefore->getFunction() == newFunction || Blocks.count(InsertBefore->getParent())) && "InsertPt should be in new function"); - assert(OAI != newFunction->arg_end() && - "Number of output arguments should match " - "the amount of defined values"); - if (AggregateArgs) { + if (AggregateArgs && StructValues.contains(outputs[i])) { + assert(AggOutputArgBegin != newFunction->arg_end() && + "Number of aggregate output arguments should match " + "the number of defined values"); Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); - Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(), + StructArgTy, &*AggOutputArgBegin, Idx, "gep_" + outputs[i]->getName(), InsertBefore); new StoreInst(outputs[i], GEP, InsertBefore); + ++aggIdx; // Since there should be only one struct argument aggregating - // all the output values, we shouldn't increment OAI, which always - // points to the struct argument, in this case. + // all the output values, we shouldn't increment AggOutputArgBegin, which + // always points to the struct argument, in this case. } else { - new StoreInst(outputs[i], &*OAI, InsertBefore); - ++OAI; + assert(ScalarOutputArgBegin != newFunction->arg_end() && + "Number of scalar output arguments should match " + "the number of defined values"); + new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertBefore); + ++ScalarOutputArgBegin; } } @@ -1840,3 +1872,7 @@ bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc, } return false; } + +void CodeExtractor::excludeArgFromAggregate(Value *Arg) { + ExcludeArgsFromAggregate.insert(Arg); +} diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index 91630d876fc8..e73287c060ae 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -122,129 +122,114 @@ isSimpleEnoughValueToCommit(Constant *C, return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL); } -/// Return true if this constant is simple enough for us to understand. In -/// particular, if it is a cast to anything other than from one pointer type to -/// another pointer type, we punt. We basically just support direct accesses to -/// globals and GEP's of globals. This should be kept up to date with -/// CommitValueTo. -static bool isSimpleEnoughPointerToCommit(Constant *C, const DataLayout &DL) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) - // Do not allow weak/*_odr/linkonce linkage or external globals. - return GV->hasUniqueInitializer(); - - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { - // Handle a constantexpr gep. 
- if (CE->getOpcode() == Instruction::GetElementPtr && - isa<GlobalVariable>(CE->getOperand(0)) && - cast<GEPOperator>(CE)->isInBounds()) { - GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); - // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or - // external globals. - if (!GV->hasUniqueInitializer()) - return false; +void Evaluator::MutableValue::clear() { + if (auto *Agg = Val.dyn_cast<MutableAggregate *>()) + delete Agg; + Val = nullptr; +} - // The first index must be zero. - ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin())); - if (!CI || !CI->isZero()) return false; +Constant *Evaluator::MutableValue::read(Type *Ty, APInt Offset, + const DataLayout &DL) const { + TypeSize TySize = DL.getTypeStoreSize(Ty); + const MutableValue *V = this; + while (const auto *Agg = V->Val.dyn_cast<MutableAggregate *>()) { + Type *AggTy = Agg->Ty; + Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset); + if (!Index || Index->uge(Agg->Elements.size()) || + !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy))) + return nullptr; + + V = &Agg->Elements[Index->getZExtValue()]; + } - // The remaining indices must be compile-time known integers within the - // notional bounds of the corresponding static array types. - if (!CE->isGEPWithNoNotionalOverIndexing()) - return false; + return ConstantFoldLoadFromConst(V->Val.get<Constant *>(), Ty, Offset, DL); +} - return ConstantFoldLoadThroughGEPConstantExpr( - GV->getInitializer(), CE, - cast<GEPOperator>(CE)->getResultElementType(), DL); - } else if (CE->getOpcode() == Instruction::BitCast && - isa<GlobalVariable>(CE->getOperand(0))) { - // A constantexpr bitcast from a pointer to another pointer is a no-op, - // and we know how to evaluate it by moving the bitcast from the pointer - // operand to the value operand. - // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or - // external globals. - return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer(); - } - } +bool Evaluator::MutableValue::makeMutable() { + Constant *C = Val.get<Constant *>(); + Type *Ty = C->getType(); + unsigned NumElements; + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + NumElements = VT->getNumElements(); + } else if (auto *AT = dyn_cast<ArrayType>(Ty)) + NumElements = AT->getNumElements(); + else if (auto *ST = dyn_cast<StructType>(Ty)) + NumElements = ST->getNumElements(); + else + return false; - return false; + MutableAggregate *MA = new MutableAggregate(Ty); + MA->Elements.reserve(NumElements); + for (unsigned I = 0; I < NumElements; ++I) + MA->Elements.push_back(C->getAggregateElement(I)); + Val = MA; + return true; } -/// Apply \p TryLoad to Ptr. If this returns \p nullptr, introspect the -/// pointer's type and walk down through the initial elements to obtain -/// additional pointers to try. Returns the first non-null return value from -/// \p TryLoad, or \p nullptr if the type can't be introspected further. -static Constant * -evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL, - const TargetLibraryInfo *TLI, - std::function<Constant *(Constant *)> TryLoad) { - Constant *Val; - while (!(Val = TryLoad(Ptr))) { - // If Ty is a non-opaque struct, we can convert the pointer to the struct - // into a pointer to its first member. - // FIXME: This could be extended to support arrays as well. 
- Type *Ty = cast<PointerType>(Ptr->getType())->getElementType(); - if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isOpaque()) - break; - - IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32); - Constant *IdxZero = ConstantInt::get(IdxTy, 0, false); - Constant *const IdxList[] = {IdxZero, IdxZero}; - - Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList); - Ptr = ConstantFoldConstant(Ptr, DL, TLI); +bool Evaluator::MutableValue::write(Constant *V, APInt Offset, + const DataLayout &DL) { + Type *Ty = V->getType(); + TypeSize TySize = DL.getTypeStoreSize(Ty); + MutableValue *MV = this; + while (Offset != 0 || + !CastInst::isBitOrNoopPointerCastable(Ty, MV->getType(), DL)) { + if (MV->Val.is<Constant *>() && !MV->makeMutable()) + return false; + + MutableAggregate *Agg = MV->Val.get<MutableAggregate *>(); + Type *AggTy = Agg->Ty; + Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset); + if (!Index || Index->uge(Agg->Elements.size()) || + !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy))) + return false; + + MV = &Agg->Elements[Index->getZExtValue()]; } - return Val; + + Type *MVType = MV->getType(); + MV->clear(); + if (Ty->isIntegerTy() && MVType->isPointerTy()) + MV->Val = ConstantExpr::getIntToPtr(V, MVType); + else if (Ty->isPointerTy() && MVType->isIntegerTy()) + MV->Val = ConstantExpr::getPtrToInt(V, MVType); + else if (Ty != MVType) + MV->Val = ConstantExpr::getBitCast(V, MVType); + else + MV->Val = V; + return true; } -static Constant *getInitializer(Constant *C) { - auto *GV = dyn_cast<GlobalVariable>(C); - return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr; +Constant *Evaluator::MutableAggregate::toConstant() const { + SmallVector<Constant *, 32> Consts; + for (const MutableValue &MV : Elements) + Consts.push_back(MV.toConstant()); + + if (auto *ST = dyn_cast<StructType>(Ty)) + return ConstantStruct::get(ST, Consts); + if (auto *AT = dyn_cast<ArrayType>(Ty)) + return ConstantArray::get(AT, Consts); + assert(isa<FixedVectorType>(Ty) && "Must be vector"); + return ConstantVector::get(Consts); } /// Return the value that would be computed by a load from P after the stores /// reflected by 'memory' have been performed. If we can't decide, return null. Constant *Evaluator::ComputeLoadResult(Constant *P, Type *Ty) { - // If this memory location has been recently stored, use the stored value: it - // is the most up-to-date. - auto TryFindMemLoc = [this](Constant *Ptr) { - return MutatedMemory.lookup(Ptr); - }; - - if (Constant *Val = TryFindMemLoc(P)) - return Val; - - // Access it. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) { - if (GV->hasDefinitiveInitializer()) - return GV->getInitializer(); + APInt Offset(DL.getIndexTypeSizeInBits(P->getType()), 0); + P = cast<Constant>(P->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true)); + Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(P->getType())); + auto *GV = dyn_cast<GlobalVariable>(P); + if (!GV) return nullptr; - } - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) { - switch (CE->getOpcode()) { - // Handle a constantexpr getelementptr. - case Instruction::GetElementPtr: - if (auto *I = getInitializer(CE->getOperand(0))) - return ConstantFoldLoadThroughGEPConstantExpr(I, CE, Ty, DL); - break; - // Handle a constantexpr bitcast. - case Instruction::BitCast: - // We're evaluating a load through a pointer that was bitcast to a - // different type. See if the "from" pointer has recently been stored. 
- // If it hasn't, we may still be able to find a stored pointer by - // introspecting the type. - Constant *Val = - evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryFindMemLoc); - if (!Val) - Val = getInitializer(CE->getOperand(0)); - if (Val) - return ConstantFoldLoadThroughBitcast( - Val, P->getType()->getPointerElementType(), DL); - break; - } - } + auto It = MutatedMemory.find(GV); + if (It != MutatedMemory.end()) + return It->second.read(Ty, Offset, DL); - return nullptr; // don't know how to evaluate. + if (!GV->hasDefinitiveInitializer()) + return nullptr; + return ConstantFoldLoadFromConst(GV->getInitializer(), Ty, Offset, DL); } static Function *getFunction(Constant *C) { @@ -260,17 +245,10 @@ static Function *getFunction(Constant *C) { Function * Evaluator::getCalleeWithFormalArgs(CallBase &CB, SmallVectorImpl<Constant *> &Formals) { - auto *V = CB.getCalledOperand(); + auto *V = CB.getCalledOperand()->stripPointerCasts(); if (auto *Fn = getFunction(getVal(V))) return getFormalParams(CB, Fn, Formals) ? Fn : nullptr; - - auto *CE = dyn_cast<ConstantExpr>(V); - if (!CE || CE->getOpcode() != Instruction::BitCast || - !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals)) - return nullptr; - - return dyn_cast<Function>( - ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL)); + return nullptr; } bool Evaluator::getFormalParams(CallBase &CB, Function *F, @@ -299,17 +277,13 @@ bool Evaluator::getFormalParams(CallBase &CB, Function *F, /// If call expression contains bitcast then we may need to cast /// evaluated return value to a type of the call expression. -Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) { - ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr); - if (!RV || !CE || CE->getOpcode() != Instruction::BitCast) +Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) { + if (!RV || RV->getType() == ReturnType) return RV; - if (auto *FT = - dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) { - RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL); - if (!RV) - LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); - } + RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL); + if (!RV) + LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); return RV; } @@ -337,68 +311,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, Ptr = FoldedPtr; LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n"); } - // Conservatively, avoid aggregate types. This is because we don't - // want to worry about them partially overlapping other stores. - if (!SI->getValueOperand()->getType()->isSingleValueType() || - !isSimpleEnoughPointerToCommit(Ptr, DL)) { - // If this is too complex for us to commit, reject it. - LLVM_DEBUG( - dbgs() << "Pointer is too complex for us to evaluate store."); + + APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0); + Ptr = cast<Constant>(Ptr->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true)); + Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(Ptr->getType())); + auto *GV = dyn_cast<GlobalVariable>(Ptr); + if (!GV || !GV->hasUniqueInitializer()) { + LLVM_DEBUG(dbgs() << "Store is not to global with unique initializer: " + << *Ptr << "\n"); return false; } - Constant *Val = getVal(SI->getOperand(0)); - // If this might be too difficult for the backend to handle (e.g. the addr // of one global variable divided by another) then we can't commit it. 
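// [Editor's sketch; not part of this commit.] Both the store path here and ComputeLoadResult above now normalize a pointer to a base global plus byte offset before consulting MutatedMemory, roughly:
//   APInt Off(DL.getIndexTypeSizeInBits(P->getType()), 0);
//   auto *Base = cast<Constant>(
//       P->stripAndAccumulateConstantOffsets(DL, Off, /*AllowNonInbounds=*/true));
//   auto *GV = dyn_cast<GlobalVariable>(Base); // null => not evaluable
// A store to @g+12 through a bitcast and a load of @g+12 through a GEP therefore hit the same MutableValue slot, which the removed matching on ConstantExpr shapes could not guarantee.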
+ Constant *Val = getVal(SI->getOperand(0)); if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) { LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val << "\n"); return false; } - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { - if (CE->getOpcode() == Instruction::BitCast) { - LLVM_DEBUG(dbgs() - << "Attempting to resolve bitcast on constant ptr.\n"); - // If we're evaluating a store through a bitcast, then we need - // to pull the bitcast off the pointer type and push it onto the - // stored value. In order to push the bitcast onto the stored value, - // a bitcast from the pointer's element type to Val's type must be - // legal. If it's not, we can try introspecting the type to find a - // legal conversion. - - auto TryCastValTy = [&](Constant *P) -> Constant * { - // The conversion is illegal if the store is wider than the - // pointee proposed by `evaluateBitcastFromPtr`, since that would - // drop stores to other struct elements when the caller attempts to - // look through a struct's 0th element. - Type *NewTy = cast<PointerType>(P->getType())->getElementType(); - Type *STy = Val->getType(); - if (DL.getTypeSizeInBits(NewTy) < DL.getTypeSizeInBits(STy)) - return nullptr; - - if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, NewTy, DL)) { - Ptr = P; - return FV; - } - return nullptr; - }; - - Constant *NewVal = - evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryCastValTy); - if (!NewVal) { - LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " - "evaluate.\n"); - return false; - } - - Val = NewVal; - LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n"); - } - } - - MutatedMemory[Ptr] = Val; + auto Res = MutatedMemory.try_emplace(GV, GV->getInitializer()); + if (!Res.first->second.write(Val, Offset, DL)) + return false; } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) { InstResult = ConstantExpr::get(BO->getOpcode(), getVal(BO->getOperand(0)), @@ -593,7 +529,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, if (Callee->isDeclaration()) { // If this is a function we can constant fold, do it. if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) { - InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C); + InstResult = castCallResultIfNeeded(CB.getType(), C); if (!InstResult) return false; LLVM_DEBUG(dbgs() << "Constant folded function call. Result: " @@ -617,7 +553,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, return false; } ValueStack.pop_back(); - InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal); + InstResult = castCallResultIfNeeded(CB.getType(), RetVal); if (RetVal && !InstResult) return false; diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 9bfc73e4ba6c..f8ec8c6ad426 100644 --- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -66,8 +66,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, for (const Use &U : V->uses()) { const User *UR = U.getUser(); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { - GS.HasNonInstructionUser = true; - // If the result of the constantexpr isn't pointer type, then we won't // know to expect it in various places. Just reject early. if (!isa<PointerType>(CE->getType())) @@ -105,9 +103,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, // value, not an aggregate), keep more specific information about // stores. 
if (GS.StoredType != GlobalStatus::Stored) { - const Value *Ptr = SI->getPointerOperand(); - if (isa<ConstantExpr>(Ptr)) - Ptr = Ptr->stripPointerCasts(); + const Value *Ptr = SI->getPointerOperand()->stripPointerCasts(); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { Value *StoredVal = SI->getOperand(0); @@ -174,12 +170,10 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, return true; // Any other non-load instruction might take address! } } else if (const Constant *C = dyn_cast<Constant>(UR)) { - GS.HasNonInstructionUser = true; // We might have a dead and dangling constant hanging off of here. if (!isSafeToDestroyConstant(C)) return true; } else { - GS.HasNonInstructionUser = true; // Otherwise must be some other user. return true; } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 997667810580..c9f872f5b7e1 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1185,10 +1185,10 @@ static bool MayContainThrowingOrExitingCall(Instruction *Begin, static AttrBuilder IdentifyValidAttributes(CallBase &CB) { - AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex); - if (AB.empty()) + AttrBuilder AB(CB.getContext(), CB.getAttributes().getRetAttrs()); + if (!AB.hasAttributes()) return AB; - AttrBuilder Valid; + AttrBuilder Valid(CB.getContext()); // Only allow these white listed attributes to be propagated back to the // callee. This is because other attributes may only be valid on the call // itself, i.e. attributes such as signext and zeroext. @@ -1208,7 +1208,7 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { return; AttrBuilder Valid = IdentifyValidAttributes(CB); - if (Valid.empty()) + if (!Valid.hasAttributes()) return; auto *CalledFunction = CB.getCalledFunction(); auto &Context = CalledFunction->getContext(); @@ -1667,7 +1667,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, Module *Mod = CB.getModule(); assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function"); bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV, - IsClaimRV = !IsRetainRV; + IsUnsafeClaimRV = !IsRetainRV; for (auto *RI : Returns) { Value *RetOpnd = objcarc::GetRCIdentityRoot(RI->getOperand(0)); @@ -1694,7 +1694,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // and erase the autoreleaseRV call. // - If retainRV is attached to the call, just erase the autoreleaseRV // call. - if (IsClaimRV) { + if (IsUnsafeClaimRV) { Builder.SetInsertPoint(II); Function *IFn = Intrinsic::getDeclaration(Mod, Intrinsic::objc_release); diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index 668626fef933..72b864dc3e48 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -339,8 +339,10 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI, #ifdef EXPENSIVE_CHECKS // Verify all sub-loops are in LCSSA form already. - for (Loop *SubLoop: L) + for (Loop *SubLoop: L) { + (void)SubLoop; // Silence unused variable warning. 
assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!"); + } #endif SmallVector<BasicBlock *, 8> ExitBlocks; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ecad79b68185..9f33d2f82732 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -492,7 +492,7 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, } } - if (isAllocLikeFn(I, TLI)) + if (isAllocationFn(I, TLI) && isAllocRemovable(cast<CallBase>(I), TLI)) return true; if (CallInst *CI = isFreeCall(I, TLI)) @@ -2189,8 +2189,8 @@ CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) { return NewCall; } -/// changeToCall - Convert the specified invoke into a normal call. -void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { +// changeToCall - Convert the specified invoke into a normal call. +CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { CallInst *NewCall = createCallMatchingInvoke(II); NewCall->takeName(II); NewCall->insertBefore(II); @@ -2207,6 +2207,7 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { II->eraseFromParent(); if (DTU) DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); + return NewCall; } BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, @@ -3147,11 +3148,6 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( if (!ITy->isIntOrIntVectorTy() || ITy->getScalarSizeInBits() > 128) return false; // Can't do integer/elements > 128 bits. - Type *DemandedTy = ITy; - if (I->hasOneUse()) - if (auto *Trunc = dyn_cast<TruncInst>(I->user_back())) - DemandedTy = Trunc->getType(); - // Try to find all the pieces corresponding to the bswap. bool FoundRoot = false; std::map<Value *, Optional<BitPart>> BPS; @@ -3165,6 +3161,7 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( "Illegal bit provenance index"); // If the upper bits are zero, then attempt to perform as a truncated op. + Type *DemandedTy = ITy; if (BitProvenance.back() == BitPart::Unset) { while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset) BitProvenance = BitProvenance.drop_back(); diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 69fd110dc3c2..92333408aaef 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -359,7 +359,7 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) { // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, - unsigned &TripCount, DominatorTree &DT, + unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, unsigned Threshold) { assert(LoopSize > 0 && "Zero loop size is not allowed!"); // Save the PP.PeelCount value set by the target in @@ -370,7 +370,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, return; // Only try to peel innermost loops by default. - // The constraint can be relaxed by the target in TTI.getUnrollingPreferences + // The constraint can be relaxed by the target in TTI.getPeelingPreferences // or by the flag -unroll-allow-loop-nests-peeling. if (!PP.AllowLoopNestsPeeling && !L->isInnermost()) return; @@ -407,8 +407,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, SmallDenseMap<PHINode *, Optional<unsigned> > IterationsToInvariance; // Now go through all Phis to calculate the number of iterations they // need to become invariants.
- // Start the max computation with the UP.PeelCount value set by the target - // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count. + // Start the max computation with the PP.PeelCount value set by the target + // in TTI.getPeelingPreferences or by the flag -unroll-peel-count. unsigned DesiredPeelCount = TargetPeelCount; BasicBlock *BackEdge = L->getLoopLatch(); assert(BackEdge && "Loop is not in simplified form?"); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index b0c622b98d5e..9ca1f4f44b97 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -99,6 +99,17 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden, #endif ); +static cl::opt<bool> +UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden, + cl::desc("Verify loopinfo after unrolling"), +#ifdef EXPENSIVE_CHECKS + cl::init(true) +#else + cl::init(false) +#endif + ); + + /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. /// \param Blocks is a vector of basic blocks representing unrolled loop. @@ -764,6 +775,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // Apply updates to the DomTree. DT = &DTU.getDomTree(); + assert(!UnrollVerifyDomtree || + DT->verify(DominatorTree::VerificationLevel::Fast)); + // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC, @@ -777,6 +791,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (CompletelyUnroll) LI->erase(L); + // LoopInfo should now be valid; confirm that. + if (UnrollVerifyLoopInfo) + LI->verify(*DT); + // After complete unrolling most of the blocks should be contained in OuterL. // However, some of them might happen to be out of OuterL (e.g. if they // precede a loop exit). In this case we might need to insert PHI nodes in diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 93157bd87c34..95db2fe8d310 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -1567,7 +1568,9 @@ Value *llvm::addRuntimeChecks( auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp); LLVMContext &Ctx = Loc->getContext(); - IRBuilder<> ChkBuilder(Loc); + IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx, + Loc->getModule()->getDataLayout()); + ChkBuilder.SetInsertPoint(Loc); // Our instructions might fold to a constant.
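// [Editor's sketch; not part of this commit.] The folder template parameter decides what IRBuilder::CreateXxx returns for foldable operands, so with InstSimplifyFolder a trivially-true check vanishes while it is being built, e.g. (hypothetical values):
//   IRBuilder<InstSimplifyFolder> B(Ctx, InstSimplifyFolder(DL));
//   B.SetInsertPoint(InsertPt);
//   Value *V = B.CreateOr(Check, ConstantInt::getFalse(Ctx));
//   // V == Check; no 'or i1 %check, false' instruction is emitted.
// This is also why LoopVersioning below can drop its dyn_cast<ConstantInt> post-check on the expanded SCEV predicate.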
Value *MemoryRuntimeCheck = nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 771b7d25b0f2..f0bf625fa18e 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -70,17 +71,14 @@ void LoopVersioning::versionLoop( "scev.check"); SCEVRuntimeCheck = Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator()); - auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck); - - // Discard the SCEV runtime check if it is always true. - if (CI && CI->isZero()) - SCEVRuntimeCheck = nullptr; + IRBuilder<InstSimplifyFolder> Builder( + RuntimeCheckBB->getContext(), + InstSimplifyFolder(RuntimeCheckBB->getModule()->getDataLayout())); if (MemRuntimeCheck && SCEVRuntimeCheck) { - RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck, - SCEVRuntimeCheck, "lver.safe"); - if (auto *I = dyn_cast<Instruction>(RuntimeCheck)) - I->insertBefore(RuntimeCheckBB->getTerminator()); + Builder.SetInsertPoint(RuntimeCheckBB->getTerminator()); + RuntimeCheck = + Builder.CreateOr(MemRuntimeCheck, SCEVRuntimeCheck, "lver.safe"); } else RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck; @@ -109,8 +107,9 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); - BranchInst::Create(NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm); + Builder.SetInsertPoint(OrigTerm); + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 8dc4702993c3..3d75dd57456d 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -297,7 +297,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, Function *F = OrigBB->getParent(); const DataLayout &DL = F->getParent()->getDataLayout(); - Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType(); + Type *EltTy = SrcAddr->getType()->getPointerElementType(); // Create a comparison of src and dst, based on which we jump to either the forward-copy part of the function (if src >= dst) or the backwards-copy diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index bb5ff59cba4b..7c9ab7f6ca2c 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -178,66 +178,30 @@ llvm::getOrCreateSanitizerCtorAndInitFunctions( } void llvm::filterDeadComdatFunctions( - Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) { - // Build a map from the comdat to the number of entries in that comdat we - // think are dead. If this fully covers the comdat group, then the entire - // group is dead. If we find another entry in the comdat group though, we'll - // have to preserve the whole group.
- SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered; + SmallVectorImpl<Function *> &DeadComdatFunctions) { + SmallPtrSet<Function *, 32> MaybeDeadFunctions; + SmallPtrSet<Comdat *, 32> MaybeDeadComdats; for (Function *F : DeadComdatFunctions) { - Comdat *C = F->getComdat(); - assert(C && "Expected all input GVs to be in a comdat!"); - ComdatEntriesCovered[C] += 1; + MaybeDeadFunctions.insert(F); + if (Comdat *C = F->getComdat()) + MaybeDeadComdats.insert(C); } - auto CheckComdat = [&](Comdat &C) { - auto CI = ComdatEntriesCovered.find(&C); - if (CI == ComdatEntriesCovered.end()) - return; - - // If this could have been covered by a dead entry, just subtract one to - // account for it. - if (CI->second > 0) { - CI->second -= 1; - return; - } - - // If we've already accounted for all the entries that were dead, the - // entire comdat is alive so remove it from the map. - ComdatEntriesCovered.erase(CI); - }; - - auto CheckAllComdats = [&] { - for (Function &F : M.functions()) - if (Comdat *C = F.getComdat()) { - CheckComdat(*C); - if (ComdatEntriesCovered.empty()) - return; - } - for (GlobalVariable &GV : M.globals()) - if (Comdat *C = GV.getComdat()) { - CheckComdat(*C); - if (ComdatEntriesCovered.empty()) - return; - } - for (GlobalAlias &GA : M.aliases()) - if (Comdat *C = GA.getComdat()) { - CheckComdat(*C); - if (ComdatEntriesCovered.empty()) - return; - } - }; - CheckAllComdats(); - - if (ComdatEntriesCovered.empty()) { - DeadComdatFunctions.clear(); - return; + // Find comdats for which all users are dead now. + SmallPtrSet<Comdat *, 32> DeadComdats; + for (Comdat *C : MaybeDeadComdats) { + auto IsUserDead = [&](GlobalObject *GO) { + auto *F = dyn_cast<Function>(GO); + return F && MaybeDeadFunctions.contains(F); + }; + if (all_of(C->getUsers(), IsUserDead)) + DeadComdats.insert(C); } - // Remove the entries that were not covering. - erase_if(DeadComdatFunctions, [&](GlobalValue *GV) { - return ComdatEntriesCovered.find(GV->getComdat()) == - ComdatEntriesCovered.end(); + // Only keep functions which have no comdat or a dead comdat. + erase_if(DeadComdatFunctions, [&](Function *F) { + Comdat *C = F->getComdat(); + return C && !DeadComdats.contains(C); }); } diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 2f2dff6b5f0b..961adf2570a7 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SampleProfileInference.h" +#include "llvm/ADT/BitVector.h" #include "llvm/Support/Debug.h" #include <queue> #include <set> @@ -144,7 +145,7 @@ public: /// A cost of decreasing the entry block's count by one. static constexpr int64_t AuxCostDecEntry = 10; /// A cost of taking an unlikely jump. - static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20; + static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30; private: /// Check for existence of an augmenting path with a positive capacity. @@ -236,7 +237,7 @@ private: } } - /// An node in a flow network. + /// A node in a flow network. struct Node { /// The cost of the cheapest path from the source to the current node. 
int64_t Distance; @@ -303,13 +304,10 @@ public: rebalanceUnknownSubgraphs(); } - /// The probability for the first successor of a unknown subgraph - static constexpr double UnknownFirstSuccProbability = 0.5; - private: void joinIsolatedComponents() { // Find blocks that are reachable from the source - auto Visited = std::vector<bool>(NumBlocks(), false); + auto Visited = BitVector(NumBlocks(), false); findReachable(Func.Entry, Visited); // Iterate over all non-reachable blocks and adjust their weights @@ -334,7 +332,7 @@ private: /// Run BFS from a given block along the jumps with a positive flow and mark /// all reachable blocks. - void findReachable(uint64_t Src, std::vector<bool> &Visited) { + void findReachable(uint64_t Src, BitVector &Visited) { if (Visited[Src]) return; std::queue<uint64_t> Queue; @@ -452,44 +450,70 @@ private: uint64_t NumBlocks() const { return Func.Blocks.size(); } - /// Rebalance unknown subgraphs so as each branch splits with probabilities - /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability + /// Rebalance unknown subgraphs so that the flow is split evenly across the + /// outgoing branches of every block of the subgraph. The method iterates over + /// blocks with known weight and identifies unknown subgraphs rooted at the + /// blocks. Then it verifies if flow rebalancing is feasible and applies it. void rebalanceUnknownSubgraphs() { - assert(UnknownFirstSuccProbability >= 0.0 && - UnknownFirstSuccProbability <= 1.0 && - "the share of the unknown successor should be between 0 and 1"); - // Try to find unknown subgraphs from each non-unknown block + // Try to find unknown subgraphs from each block for (uint64_t I = 0; I < Func.Blocks.size(); I++) { auto SrcBlock = &Func.Blocks[I]; - // Do not attempt to find unknown successors from a unknown or a - // zero-flow block - if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0) + // Verify if rebalancing rooted at SrcBlock is feasible if (!canRebalanceAtRoot(SrcBlock)) continue; - std::vector<FlowBlock *> UnknownSuccs; + // Find an unknown subgraph starting at SrcBlock. Along the way, + // fill in known destinations and intermediate unknown blocks. + std::vector<FlowBlock *> UnknownBlocks; + std::vector<FlowBlock *> KnownDstBlocks; + findUnknownSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks); + + // Verify if rebalancing of the subgraph is feasible. If the search is + // successful, find the unique destination block (which can be null) FlowBlock *DstBlock = nullptr; - // Find a unknown subgraphs starting at block SrcBlock - if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + if (!canRebalanceSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks, + DstBlock)) continue; - // At the moment, we do not rebalance subgraphs containing cycles among - // unknown blocks - if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + + // We cannot rebalance subgraphs containing cycles among unknown blocks + if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownBlocks)) continue; // Rebalance the flow - rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs); + rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownBlocks); } } - /// Find a unknown subgraph starting at block SrcBlock. - /// If the search is successful, the method sets DstBlock and UnknownSuccs. - bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock, - std::vector<FlowBlock *> &UnknownSuccs) { + /// Verify if rebalancing rooted at a given block is possible.
+ bool canRebalanceAtRoot(const FlowBlock *SrcBlock) { + // Do not attempt to find unknown subgraphs from an unknown or a + // zero-flow block + if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0) + return false; + + // Do not attempt to process subgraphs from a block w/o unknown successors + bool HasUnknownSuccs = false; + for (auto Jump : SrcBlock->SuccJumps) { + if (Func.Blocks[Jump->Target].UnknownWeight) { + HasUnknownSuccs = true; + break; + } + } + if (!HasUnknownSuccs) + return false; + + return true; + } + + /// Find an unknown subgraph starting at block SrcBlock. The method sets + /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks. + void findUnknownSubgraph(const FlowBlock *SrcBlock, + std::vector<FlowBlock *> &KnownDstBlocks, + std::vector<FlowBlock *> &UnknownBlocks) { // Run BFS from SrcBlock and make sure all paths are going through unknown // blocks and end at a non-unknown DstBlock - auto Visited = std::vector<bool>(NumBlocks(), false); + auto Visited = BitVector(NumBlocks(), false); std::queue<uint64_t> Queue; - DstBlock = nullptr; Queue.push(SrcBlock->Index); Visited[SrcBlock->Index] = true; @@ -498,52 +522,105 @@ private: Queue.pop(); // Process blocks reachable from Block for (auto Jump : Block.SuccJumps) { + // If Jump can be ignored, skip it + if (ignoreJump(SrcBlock, nullptr, Jump)) + continue; + uint64_t Dst = Jump->Target; + // If Dst has been visited, skip Jump if (Visited[Dst]) continue; + // Process block Dst Visited[Dst] = true; if (!Func.Blocks[Dst].UnknownWeight) { - // If we see non-unique non-unknown block reachable from SrcBlock, - // stop processing and skip rebalancing - FlowBlock *CandidateDstBlock = &Func.Blocks[Dst]; - if (DstBlock != nullptr && DstBlock != CandidateDstBlock) - return false; - DstBlock = CandidateDstBlock; + KnownDstBlocks.push_back(&Func.Blocks[Dst]); } else { Queue.push(Dst); - UnknownSuccs.push_back(&Func.Blocks[Dst]); + UnknownBlocks.push_back(&Func.Blocks[Dst]); } } } + } + /// Verify if rebalancing of the subgraph is feasible. If the checks are + /// successful, set the unique destination block, DstBlock (can be null). + bool canRebalanceSubgraph(const FlowBlock *SrcBlock, + const std::vector<FlowBlock *> &KnownDstBlocks, + const std::vector<FlowBlock *> &UnknownBlocks, + FlowBlock *&DstBlock) { // If the list of unknown blocks is empty, we don't need rebalancing - if (UnknownSuccs.empty()) + if (UnknownBlocks.empty()) return false; - // If all reachable nodes from SrcBlock are unknown, skip rebalancing - if (DstBlock == nullptr) + + // If there are multiple known sinks, we can't rebalance + if (KnownDstBlocks.size() > 1) return false; - // If any of the unknown blocks is an exit block, skip rebalancing - for (auto Block : UnknownSuccs) { - if (Block->isExit()) + DstBlock = KnownDstBlocks.empty() ?
nullptr : KnownDstBlocks.front(); + + // Verify sinks of the subgraph + for (auto Block : UnknownBlocks) { + if (Block->SuccJumps.empty()) { + // If there are multiple (known and unknown) sinks, we can't rebalance + if (DstBlock != nullptr) + return false; + continue; + } + size_t NumIgnoredJumps = 0; + for (auto Jump : Block->SuccJumps) { + if (ignoreJump(SrcBlock, DstBlock, Jump)) + NumIgnoredJumps++; + } + // If there is a non-sink block in UnknownBlocks with all jumps ignored, + // then we can't rebalance + if (NumIgnoredJumps == Block->SuccJumps.size()) return false; } return true; } + /// Decide whether the Jump is ignored while processing an unknown subgraph + /// rooted at basic block SrcBlock with the destination block, DstBlock. + bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock, + const FlowJump *Jump) { + // Ignore unlikely jumps with zero flow + if (Jump->IsUnlikely && Jump->Flow == 0) + return true; + + auto JumpSource = &Func.Blocks[Jump->Source]; + auto JumpTarget = &Func.Blocks[Jump->Target]; + + // Do not ignore jumps coming into DstBlock + if (DstBlock != nullptr && JumpTarget == DstBlock) + return false; + + // Ignore jumps out of SrcBlock to known blocks + if (!JumpTarget->UnknownWeight && JumpSource == SrcBlock) + return true; + + // Ignore jumps to known blocks with zero flow + if (!JumpTarget->UnknownWeight && JumpTarget->Flow == 0) + return true; + + return false; + } + /// Verify if the given unknown subgraph is acyclic, and if yes, reorder - /// UnknownSuccs in the topological order (so that all jumps are "forward"). - bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, - std::vector<FlowBlock *> &UnknownSuccs) { + /// UnknownBlocks in the topological order (so that all jumps are "forward").
/// Verify if the given unknown subgraph is acyclic, and if yes, reorder
- /// UnknownSuccs in the topological order (so that all jumps are "forward").
- bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// UnknownBlocks in the topological order (so that all jumps are "forward").
+ bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ std::vector<FlowBlock *> &UnknownBlocks) {
// Extract local in-degrees in the considered subgraph
auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
- for (auto Jump : SrcBlock->SuccJumps) {
- LocalInDegree[Jump->Target]++;
- }
- for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
- for (auto Jump : UnknownSuccs[I]->SuccJumps) {
+ auto fillInDegree = [&](const FlowBlock *Block) {
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
LocalInDegree[Jump->Target]++;
}
+ };
+ fillInDegree(SrcBlock);
+ for (auto Block : UnknownBlocks) {
+ fillInDegree(Block);
}
// A loop containing SrcBlock
if (LocalInDegree[SrcBlock->Index] > 0)
@@ -553,15 +630,20 @@ private:
std::queue<uint64_t> Queue;
Queue.push(SrcBlock->Index);
while (!Queue.empty()) {
- auto &Block = Func.Blocks[Queue.front()];
+ FlowBlock *Block = &Func.Blocks[Queue.front()];
Queue.pop();
- // Stop propagation once we reach DstBlock
- if (Block.Index == DstBlock->Index)
+ // Stop propagation once we reach DstBlock, if any
+ if (DstBlock != nullptr && Block == DstBlock)
break;
- AcyclicOrder.push_back(&Block);
+ // Keep an acyclic order of unknown blocks
+ if (Block->UnknownWeight && Block != SrcBlock)
+ AcyclicOrder.push_back(Block);
+
// Add to the queue all successors with zero local in-degree
- for (auto Jump : Block.SuccJumps) {
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
uint64_t Dst = Jump->Target;
LocalInDegree[Dst]--;
if (LocalInDegree[Dst] == 0) {
@@ -572,42 +654,69 @@ private:
// If there is a cycle in the subgraph, AcyclicOrder contains only a subset
// of all blocks
- if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
+ if (UnknownBlocks.size() != AcyclicOrder.size())
return false;
- UnknownSuccs = AcyclicOrder;
+ UnknownBlocks = AcyclicOrder;
return true;
}
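The final size comparison is the usual certificate from Kahn's algorithm: blocks on a cycle never reach local in-degree zero and therefore never enter AcyclicOrder. A self-contained toy version of the same check, on a plain adjacency list (illustrative, with hypothetical names):

#include <cstddef>
#include <queue>
#include <vector>

// Returns true iff the digraph given as adjacency lists is acyclic, using
// the same in-degree peeling as isAcyclicSubgraph above.
static bool isAcyclic(const std::vector<std::vector<size_t>> &Adj) {
  std::vector<size_t> InDegree(Adj.size(), 0);
  for (const auto &Succs : Adj)
    for (size_t Dst : Succs)
      ++InDegree[Dst];
  std::queue<size_t> Queue;
  for (size_t I = 0; I < Adj.size(); ++I)
    if (InDegree[I] == 0)
      Queue.push(I);
  size_t NumPeeled = 0;
  while (!Queue.empty()) {
    size_t Node = Queue.front();
    Queue.pop();
    ++NumPeeled;
    for (size_t Dst : Adj[Node])
      if (--InDegree[Dst] == 0)
        Queue.push(Dst);
  }
  // Nodes on a cycle never reach in-degree zero, so they are never peeled.
  return NumPeeled == Adj.size();
}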
- /// Rebalance a given subgraph.
- void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock, with
+ /// UnknownBlocks as the intermediate blocks.
+ void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
+ const FlowBlock *DstBlock,
+ const std::vector<FlowBlock *> &UnknownBlocks) {
assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
- assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
- for (auto Block : UnknownSuccs) {
+ // Distribute flow from the source block
+ uint64_t BlockFlow = 0;
+ // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
+ for (auto Jump : SrcBlock->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ BlockFlow += Jump->Flow;
+ }
+ rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
+
+ // Distribute flow from the remaining blocks
+ for (auto Block : UnknownBlocks) {
+ assert(Block->UnknownWeight && "incorrect unknown subgraph");
+ uint64_t BlockFlow = 0;
// Block's flow is the sum of incoming flows
- uint64_t TotalFlow = 0;
- if (Block == SrcBlock) {
- TotalFlow = Block->Flow;
- } else {
- for (auto Jump : Block->PredJumps) {
- TotalFlow += Jump->Flow;
- }
- Block->Flow = TotalFlow;
+ for (auto Jump : Block->PredJumps) {
+ BlockFlow += Jump->Flow;
}
+ Block->Flow = BlockFlow;
+ rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
+ }
+ }

- // Process all successor jumps and update corresponding flow values
- for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
- auto Jump = Block->SuccJumps[I];
- if (I + 1 == Block->SuccJumps.size()) {
- Jump->Flow = TotalFlow;
- continue;
- }
- uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
- Jump->Flow = Flow;
- TotalFlow -= Flow;
- }
+ /// Redistribute flow for a block in a subgraph rooted at SrcBlock
+ /// and ending at DstBlock.
+ void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ const FlowBlock *Block, uint64_t BlockFlow) {
+ // Process all successor jumps and update corresponding flow values
+ size_t BlockDegree = 0;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ BlockDegree++;
+ }
+ // If all successor jumps of the block are ignored, skip it
+ if (DstBlock == nullptr && BlockDegree == 0)
+ return;
+ assert(BlockDegree > 0 && "all outgoing jumps are ignored");
+
+ // Each of the Block's successors gets the following amount of flow.
+ // Rounding the value up so that all flow is propagated
+ uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ uint64_t Flow = std::min(SuccFlow, BlockFlow);
+ Jump->Flow = Flow;
+ BlockFlow -= Flow;
}
+ assert(BlockFlow == 0 && "not all flow is propagated");
}

/// A constant indicating an arbitrary exit block of a function.
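The rounding in rebalanceBlock is worth a worked example: each surviving jump gets the ceiling of the even share, capped by whatever flow is still left, so the total is preserved exactly. A standalone sketch of the same arithmetic (hypothetical helper, not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Split BlockFlow across Degree outgoing jumps the way rebalanceBlock does.
static std::vector<uint64_t> splitFlow(uint64_t BlockFlow, size_t Degree) {
  std::vector<uint64_t> Shares;
  // Round the per-jump share up so that no flow is lost to truncation.
  uint64_t SuccFlow = (BlockFlow + Degree - 1) / Degree;
  for (size_t I = 0; I < Degree; ++I) {
    // Cap each share by the flow that remains to keep the total exact.
    uint64_t Flow = std::min(SuccFlow, BlockFlow);
    Shares.push_back(Flow);
    BlockFlow -= Flow;
  }
  assert(BlockFlow == 0 && "not all flow is propagated");
  return Shares;
}

// For example, splitFlow(10, 3) yields {4, 4, 2}: each jump receives
// ceil(10/3) = 4 until the remaining flow runs out.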
@@ -799,7 +908,7 @@ void verifyWeights(const FlowFunction &Func) { // Run BFS from the source along edges with positive flow std::queue<uint64_t> Queue; - auto Visited = std::vector<bool>(NumBlocks, false); + auto Visited = BitVector(NumBlocks, false); Queue.push(Func.Entry); Visited[Func.Entry] = true; while (!Queue.empty()) { diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index c840ee85795f..5363a851fc27 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -173,7 +173,7 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) { auto *PtrTy = cast<PointerType>(Ty); if (DL.isNonIntegralPointerType(PtrTy)) { auto *Int8PtrTy = Builder.getInt8PtrTy(PtrTy->getAddressSpace()); - assert(DL.getTypeAllocSize(Int8PtrTy->getElementType()) == 1 && + assert(DL.getTypeAllocSize(Builder.getInt8Ty()) == 1 && "alloc size of i8 must by 1 byte for the GEP to be correct"); auto *GEP = Builder.CreateGEP( Builder.getInt8Ty(), Constant::getNullValue(Int8PtrTy), V, "uglygep"); @@ -471,7 +471,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, // indexes into the array implied by the pointer operand; the rest of // the indices index into the element or field type selected by the // preceding index. - Type *ElTy = PTy->getElementType(); + Type *ElTy = PTy->getNonOpaquePointerElementType(); for (;;) { // If the scale size is not 0, attempt to factor out a scale for // array indexing. @@ -640,8 +640,8 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, Value *Casted = V; if (V->getType() != PTy) Casted = InsertNoopCastOfTo(Casted, PTy); - Value *GEP = Builder.CreateGEP(PTy->getElementType(), Casted, GepIndices, - "scevgep"); + Value *GEP = Builder.CreateGEP(PTy->getNonOpaquePointerElementType(), + Casted, GepIndices, "scevgep"); Ops.push_back(SE.getUnknown(GEP)); } @@ -1671,7 +1671,7 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { return Builder.CreateSExt(V, Ty); } -Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { +Value *SCEVExpander::expandSMaxExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands()-2; i >= 0; --i) { @@ -1700,7 +1700,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { return LHS; } -Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { +Value *SCEVExpander::expandUMaxExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands()-2; i >= 0; --i) { @@ -1729,7 +1729,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { return LHS; } -Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { +Value *SCEVExpander::expandSMinExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands() - 2; i >= 0; --i) { @@ -1758,7 +1758,7 @@ Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { return LHS; } -Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { +Value *SCEVExpander::expandUMinExpr(const SCEVNAryExpr *S) { Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); for (int i = S->getNumOperands() - 2; i >= 0; --i) { @@ -1787,6 +1787,40 @@ Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { return LHS; } +Value 
*SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { + return expandSMaxExpr(S); +} + +Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { + return expandUMaxExpr(S); +} + +Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { + return expandSMinExpr(S); +} + +Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { + return expandUMinExpr(S); +} + +Value *SCEVExpander::visitSequentialUMinExpr(const SCEVSequentialUMinExpr *S) { + SmallVector<Value *> Ops; + for (const SCEV *Op : S->operands()) + Ops.emplace_back(expand(Op)); + + Value *SaturationPoint = + MinMaxIntrinsic::getSaturationPoint(Intrinsic::umin, S->getType()); + + SmallVector<Value *> OpIsZero; + for (Value *Op : ArrayRef<Value *>(Ops).drop_back()) + OpIsZero.emplace_back(Builder.CreateICmpEQ(Op, SaturationPoint)); + + Value *AnyOpIsZero = Builder.CreateLogicalOr(OpIsZero); + + Value *NaiveUMin = expandUMinExpr(S); + return Builder.CreateSelect(AnyOpIsZero, SaturationPoint, NaiveUMin); +} + Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, Instruction *IP, bool Root) { setInsertPoint(IP); @@ -1809,8 +1843,8 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { // instruction. Instruction *Tmp; if (Inst->getType()->isIntegerTy()) - Tmp = - cast<Instruction>(Builder.CreateAdd(Inst, Inst, "tmp.lcssa.user")); + Tmp = cast<Instruction>(Builder.CreateIntToPtr( + Inst, Inst->getType()->getPointerTo(), "tmp.lcssa.user")); else { assert(Inst->getType()->isPointerTy()); Tmp = cast<Instruction>(Builder.CreatePtrToInt( @@ -1947,22 +1981,14 @@ Value *SCEVExpander::expand(const SCEV *S) { if (VO.second) { if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { - Type *Ety = Vty->getPointerElementType(); int64_t Offset = VO.second->getSExtValue(); - int64_t ESize = SE.getTypeSizeInBits(Ety); - if ((Offset * 8) % ESize == 0) { - ConstantInt *Idx = - ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize); - V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); - } else { - ConstantInt *Idx = - ConstantInt::getSigned(VO.second->getType(), -Offset); - unsigned AS = Vty->getAddressSpace(); - V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); - V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, - "uglygep"); - V = Builder.CreateBitCast(V, Vty); - } + ConstantInt *Idx = + ConstantInt::getSigned(VO.second->getType(), -Offset); + unsigned AS = Vty->getAddressSpace(); + V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); + V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, + "uglygep"); + V = Builder.CreateBitCast(V, Vty); } else { V = Builder.CreateSub(V, VO.second); } @@ -2271,10 +2297,27 @@ template<typename T> static InstructionCost costAndCollectOperands( case scSMaxExpr: case scUMaxExpr: case scSMinExpr: - case scUMinExpr: { + case scUMinExpr: + case scSequentialUMinExpr: { // FIXME: should this ask the cost for Intrinsic's? + // The reduction tree. Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 1); Cost += CmpSelCost(Instruction::Select, S->getNumOperands() - 1, 0, 2); + switch (S->getSCEVType()) { + case scSequentialUMinExpr: { + // The safety net against poison. + // FIXME: this is broken. + Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 0); + Cost += ArithCost(Instruction::Or, + S->getNumOperands() > 2 ? 
S->getNumOperands() - 2 : 0);
+ Cost += CmpSelCost(Instruction::Select, 1, 0, 1);
+ break;
+ }
+ default:
+ assert(!isa<SCEVSequentialMinMaxExpr>(S) &&
+ "Unhandled SCEV expression type?");
+ break;
+ }
break;
}
case scAddRecExpr: {
@@ -2362,7 +2405,7 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scConstant: {
// Only evaluate the costs of constants when optimizing for size.
if (CostKind != TargetTransformInfo::TCK_CodeSize)
- return 0;
+ return false;
const APInt &Imm = cast<SCEVConstant>(S)->getAPInt();
Type *Ty = S->getType();
Cost += TTI.getIntImmCostInst(
@@ -2399,7 +2442,8 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
- case scSMinExpr: {
+ case scSMinExpr:
+ case scSequentialUMinExpr: {
assert(cast<SCEVNAryExpr>(S)->getNumOperands() > 1 &&
"Nary expr should have more than 1 operand.");
// The simple nary expr will require one less op (or pair of ops)
@@ -2490,49 +2534,73 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
- // Get the backedge taken count and truncate or extended to the AR type.
- Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
- // Compute |Step| * Backedge
- Value *MulV, *OfMul;
- if (Step->isOne()) {
- // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
- // needed, there is never an overflow, so to avoid artificially inflating
- // the cost of the check, directly emit the optimized IR.
- MulV = TruncTripCount;
- OfMul = ConstantInt::getFalse(MulV->getContext());
- } else {
- auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
- Intrinsic::umul_with_overflow, Ty);
- CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
- MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
- OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
- }
- // Compute:
- // Start + |Step| * Backedge < Start
- // Start - |Step| * Backedge > Start
- Value *Add = nullptr, *Sub = nullptr;
- if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
- StartValue = InsertNoopCastOfTo(
- StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace()));
- Value *NegMulV = Builder.CreateNeg(MulV);
- Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
- Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
- } else {
- Add = Builder.CreateAdd(StartValue, MulV);
- Sub = Builder.CreateSub(StartValue, MulV);
- }
-
- Value *EndCompareGT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+ // 1. Start + |Step| * Backedge < Start
+ // 2. Start - |Step| * Backedge > Start
+ //
+ // And select either 1. or 2. depending on whether step is positive or
+ // negative. If Step is known to be positive or negative, only create
+ // either 1. or 2.
+ auto ComputeEndCheck = [&]() -> Value * {
+ // Checking <u 0 is always false.
+ if (!Signed && Start->isZero() && SE.isKnownPositive(Step))
+ return ConstantInt::getFalse(Loc->getContext());
+
+ // Get the backedge taken count and truncate or extend it to the AR type.
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
+
+ Value *MulV, *OfMul;
+ if (Step->isOne()) {
+ // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
+ // needed; there is never an overflow, so to avoid artificially inflating
+ // the cost of the check, directly emit the optimized IR.
+ MulV = TruncTripCount; + OfMul = ConstantInt::getFalse(MulV->getContext()); + } else { + auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), + Intrinsic::umul_with_overflow, Ty); + CallInst *Mul = + Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); + MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); + OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); + } - Value *EndCompareLT = Builder.CreateICmp( - Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue); + Value *Add = nullptr, *Sub = nullptr; + bool NeedPosCheck = !SE.isKnownNegative(Step); + bool NeedNegCheck = !SE.isKnownPositive(Step); + + if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) { + StartValue = InsertNoopCastOfTo( + StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace())); + Value *NegMulV = Builder.CreateNeg(MulV); + if (NeedPosCheck) + Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV); + if (NeedNegCheck) + Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV); + } else { + if (NeedPosCheck) + Add = Builder.CreateAdd(StartValue, MulV); + if (NeedNegCheck) + Sub = Builder.CreateSub(StartValue, MulV); + } - // Select the answer based on the sign of Step. - Value *EndCheck = - Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT); + Value *EndCompareLT = nullptr; + Value *EndCompareGT = nullptr; + Value *EndCheck = nullptr; + if (NeedPosCheck) + EndCheck = EndCompareLT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue); + if (NeedNegCheck) + EndCheck = EndCompareGT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue); + if (NeedPosCheck && NeedNegCheck) { + // Select the answer based on the sign of Step. + EndCheck = Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT); + } + return Builder.CreateOr(EndCheck, OfMul); + }; + Value *EndCheck = ComputeEndCheck(); // If the backedge taken count type is larger than the AR type, // check that we don't drop any bits by truncating it. If we are @@ -2548,7 +2616,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck); } - return Builder.CreateOr(EndCheck, OfMul); + return EndCheck; } Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred, @@ -2578,17 +2646,16 @@ Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred, Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union, Instruction *IP) { - auto *BoolType = IntegerType::get(IP->getContext(), 1); - Value *Check = ConstantInt::getNullValue(BoolType); - // Loop over all checks in this set. + SmallVector<Value *> Checks; for (auto Pred : Union->getPredicates()) { - auto *NextCheck = expandCodeForPredicate(Pred, IP); + Checks.push_back(expandCodeForPredicate(Pred, IP)); Builder.SetInsertPoint(IP); - Check = Builder.CreateOr(Check, NextCheck); } - return Check; + if (Checks.empty()) + return ConstantInt::getFalse(IP->getContext()); + return Builder.CreateOr(Checks); } Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) { @@ -2720,13 +2787,8 @@ void SCEVExpanderCleaner::cleanup() { // Remove sets with value handles. Expander.clear(); - // Sort so that earlier instructions do not dominate later instructions. - stable_sort(InsertedInstructions, [this](Instruction *A, Instruction *B) { - return DT.dominates(B, A); - }); // Remove all inserted instructions. 
- for (Instruction *I : InsertedInstructions) {
-
+ for (Instruction *I : reverse(InsertedInstructions)) {
#ifndef NDEBUG
assert(all_of(I->users(),
[&InsertedSet](Value *U) {
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 1046998c26de..335ac03ccb52 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2052,109 +2052,119 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
if (ScanIdx == 0)
return false;
- // Okay, we *could* sink last ScanIdx instructions. But how many can we
- // actually sink before encountering instruction that is unprofitable to sink?
- auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
- unsigned NumPHIdValues = 0;
- for (auto *I : *LRI)
- for (auto *V : PHIOperands[I]) {
- if (!InstructionsToSink.contains(V))
- ++NumPHIdValues;
- // FIXME: this check is overly optimistic. We may end up not sinking
- // said instruction, due to the very same profitability check.
- // See @creating_too_many_phis in sink-common-code.ll.
- }
- LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
- unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
- if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ bool followedByDeoptOrUnreachable = IsBlockFollowedByDeoptOrUnreachable(BB);
+
+ if (!followedByDeoptOrUnreachable) {
+ // Okay, we *could* sink the last ScanIdx instructions. But how many can we
+ // actually sink before encountering an instruction that is unprofitable to
+ // sink?
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I]) {
+ if (!InstructionsToSink.contains(V))
+ ++NumPHIdValues;
+ // FIXME: this check is overly optimistic. We may end up not sinking
+ // said instruction, due to the very same profitability check.
+ // See @creating_too_many_phis in sink-common-code.ll.
+ }
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
- return NumPHIInsts <= 1;
- };
+ return NumPHIInsts <= 1;
+ };

- // We've determined that we are going to sink last ScanIdx instructions,
- // and recorded them in InstructionsToSink. Now, some instructions may be
- // unprofitable to sink. But that determination depends on the instructions
- // that we are going to sink.
-
- // First, forward scan: find the first instruction unprofitable to sink,
- // recording all the ones that are profitable to sink.
- // FIXME: would it be better, after we detect that not all are profitable.
- // to either record the profitable ones, or erase the unprofitable ones?
- // Maybe we need to choose (at runtime) the one that will touch least instrs?
- LRI.reset();
- int Idx = 0;
- SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
- while (Idx < ScanIdx) {
- if (!ProfitableToSinkInstruction(LRI)) {
- // Too many PHIs would be created.
- LLVM_DEBUG(
- dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
- break;
+ // We've determined that we are going to sink the last ScanIdx instructions,
+ // and recorded them in InstructionsToSink. Now, some instructions may be
+ // unprofitable to sink. But that determination depends on the instructions
+ // that we are going to sink.
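For intuition, the profitability cutoff in ProfitableToSinkInstruction boils down to a ceiling division; a standalone restatement of the same arithmetic (hypothetical helper, mirroring the lambda above):

// Sinking is deemed profitable only if at most one PHI node would be
// created for the group of sunk instructions.
static bool profitableToSink(unsigned NumPHIdValues, unsigned NumPreds) {
  unsigned NumPHIInsts = NumPHIdValues / NumPreds;
  if (NumPHIdValues % NumPreds != 0)
    ++NumPHIInsts;
  return NumPHIInsts <= 1;
}
// E.g. 2 predecessors and 2 PHI'd values -> 1 PHI -> profitable;
// 2 predecessors and 3 PHI'd values -> 2 PHIs -> not profitable.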
+
+ // First, forward scan: find the first instruction unprofitable to sink,
+ // recording all the ones that are profitable to sink.
+ // FIXME: would it be better, after we detect that not all are profitable,
+ // to either record the profitable ones, or erase the unprofitable ones?
+ // Maybe we need to choose (at runtime) the one that will touch least
+ // instrs?
+ LRI.reset();
+ int Idx = 0;
+ SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
+ while (Idx < ScanIdx) {
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+ InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
+ --LRI;
+ ++Idx;
}
- InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
- --LRI;
- ++Idx;
- }
- // If no instructions can be sunk, early-return.
- if (Idx == 0)
- return false;
+ // If no instructions can be sunk, early-return.
+ if (Idx == 0)
+ return false;

- // Did we determine that (only) some instructions are unprofitable to sink?
- if (Idx < ScanIdx) {
- // Okay, some instructions are unprofitable.
- ScanIdx = Idx;
- InstructionsToSink = InstructionsProfitableToSink;
-
- // But, that may make other instructions unprofitable, too.
- // So, do a backward scan, do any earlier instructions become unprofitable?
- assert(!ProfitableToSinkInstruction(LRI) &&
- "We already know that the last instruction is unprofitable to sink");
- ++LRI;
- --Idx;
- while (Idx >= 0) {
- // If we detect that an instruction becomes unprofitable to sink,
- // all earlier instructions won't be sunk either,
- // so preemptively keep InstructionsProfitableToSink in sync.
- // FIXME: is this the most performant approach?
- for (auto *I : *LRI)
- InstructionsProfitableToSink.erase(I);
- if (!ProfitableToSinkInstruction(LRI)) {
- // Everything starting with this instruction won't be sunk.
- ScanIdx = Idx;
- InstructionsToSink = InstructionsProfitableToSink;
- }
+ // Did we determine that (only) some instructions are unprofitable to sink?
+ if (Idx < ScanIdx) {
+ // Okay, some instructions are unprofitable.
+ ScanIdx = Idx;
+ InstructionsToSink = InstructionsProfitableToSink;
+
+ // But, that may make other instructions unprofitable, too.
+ // So, do a backward scan: do any earlier instructions become
+ // unprofitable?
+ assert(
+ !ProfitableToSinkInstruction(LRI) &&
+ "We already know that the last instruction is unprofitable to sink");
++LRI;
--Idx;
+ while (Idx >= 0) {
+ // If we detect that an instruction becomes unprofitable to sink,
+ // all earlier instructions won't be sunk either,
+ // so preemptively keep InstructionsProfitableToSink in sync.
+ // FIXME: is this the most performant approach?
+ for (auto *I : *LRI)
+ InstructionsProfitableToSink.erase(I);
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Everything starting with this instruction won't be sunk.
+ ScanIdx = Idx;
+ InstructionsToSink = InstructionsProfitableToSink;
+ }
+ ++LRI;
+ --Idx;
+ }
}

- // If no instructions can be sunk, early-return.
- if (ScanIdx == 0)
- return false;
+ // If no instructions can be sunk, early-return.
+ if (ScanIdx == 0)
+ return false;
+ }

bool Changed = false;

if (HaveNonUnconditionalPredecessors) {
- // It is always legal to sink common instructions from unconditional
- // predecessors. However, if not all predecessors are unconditional,
- // this transformation might be pessimizing.
So as a rule of thumb, - // don't do it unless we'd sink at least one non-speculatable instruction. - // See https://bugs.llvm.org/show_bug.cgi?id=30244 - LRI.reset(); - int Idx = 0; - bool Profitable = false; - while (Idx < ScanIdx) { - if (!isSafeToSpeculativelyExecute((*LRI)[0])) { - Profitable = true; - break; + if (!followedByDeoptOrUnreachable) { + // It is always legal to sink common instructions from unconditional + // predecessors. However, if not all predecessors are unconditional, + // this transformation might be pessimizing. So as a rule of thumb, + // don't do it unless we'd sink at least one non-speculatable instruction. + // See https://bugs.llvm.org/show_bug.cgi?id=30244 + LRI.reset(); + int Idx = 0; + bool Profitable = false; + while (Idx < ScanIdx) { + if (!isSafeToSpeculativelyExecute((*LRI)[0])) { + Profitable = true; + break; + } + --LRI; + ++Idx; } - --LRI; - ++Idx; + if (!Profitable) + return false; } - if (!Profitable) - return false; LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n"); // We have a conditional edge and we're going to sink some instructions. @@ -4935,14 +4945,13 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, AssumptionCache *AC, const DataLayout &DL) { Value *Cond = SI->getCondition(); - unsigned Bits = Cond->getType()->getIntegerBitWidth(); KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) // bits are in the condition value. - unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1; - unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits; + unsigned MaxSignificantBitsInCond = + ComputeMaxSignificantBits(Cond, DL, 0, AC, SI); // Gather dead cases. SmallVector<ConstantInt *, 8> DeadCases; @@ -4973,8 +4982,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, bool HasDefault = !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); const unsigned NumUnknownBits = - Bits - (Known.Zero | Known.One).countPopulation(); - assert(NumUnknownBits <= Bits); + Known.getBitWidth() - (Known.Zero | Known.One).countPopulation(); + assert(NumUnknownBits <= Known.getBitWidth()); if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { @@ -5796,10 +5805,9 @@ static void reuseTableCompare( for (auto ValuePair : Values) { Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(), ValuePair.second, CmpOp1, true); - if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst)) + if (!CaseConst || CaseConst == DefaultConst || + (CaseConst != TrueConst && CaseConst != FalseConst)) return; - assert((CaseConst == TrueConst || CaseConst == FalseConst) && - "Expect true or false as compare result."); } // Check if the branch instruction dominates the phi node. 
It's a simple diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 02727a3dbf9c..e02d02a05752 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -602,7 +602,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { Align MemSetAlign = CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne(); CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, MemSetAlign); - AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0)); + AttrBuilder ArgAttrs(CI->getContext(), CI->getAttributes().getParamAttrs(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); copyFlags(*CI, NewCI); @@ -2515,8 +2515,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, } else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) { // sprintf(dest, "%s", str) -> stpcpy(dest, str) - dest // Handle mismatched pointer types (goes away with typeless pointers?). - V = B.CreatePointerCast(V, Dest->getType()); - Value *PtrDiff = B.CreatePtrDiff(V, Dest); + V = B.CreatePointerCast(V, B.getInt8PtrTy()); + Dest = B.CreatePointerCast(Dest, B.getInt8PtrTy()); + Value *PtrDiff = B.CreatePtrDiff(B.getInt8Ty(), V, Dest); return B.CreateIntCast(PtrDiff, CI->getType(), false); } diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index b822db938af8..8947303674ee 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -398,13 +398,17 @@ Value *Mapper::mapValue(const Value *V) { SmallVector<ValueAsMetadata *, 4> MappedArgs; for (auto *VAM : AL->getArgs()) { // Map both Local and Constant VAMs here; they will both ultimately - // be mapped via mapValue (apart from constants when we have no - // module level changes, which have an identity mapping). + // be mapped via mapValue. The exceptions are constants when we have no + // module level changes and locals when they have no existing mapped + // value and RF_IgnoreMissingLocals is set; these have identity + // mappings. if ((Flags & RF_NoModuleLevelChanges) && isa<ConstantAsMetadata>(VAM)) { MappedArgs.push_back(VAM); } else if (Value *LV = mapValue(VAM->getValue())) { MappedArgs.push_back( LV == VAM->getValue() ? VAM : ValueAsMetadata::get(LV)); + } else if ((Flags & RF_IgnoreMissingLocals) && isa<LocalAsMetadata>(VAM)) { + MappedArgs.push_back(VAM); } else { // If we cannot map the value, set the argument as undef. MappedArgs.push_back(ValueAsMetadata::get( diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 5a4a2f0924f6..97c2acb7d4c7 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -698,8 +698,9 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { ChainInstrs.push_back(&I); continue; } - if (I.mayThrow()) { - LLVM_DEBUG(dbgs() << "LSV: Found may-throw operation: " << I << '\n'); + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) { + LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: " + << I << '\n'); break; } if (I.mayReadOrWriteMemory()) @@ -853,13 +854,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) { (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; - // Make sure all the users of a vector are constant-index extracts. 
- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { - const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); - return EEI && isa<ConstantInt>(EEI->getOperand(1)); - })) - continue; - // Save the load locations. const ChainID ID = getChainID(Ptr); LoadRefs[ID].push_back(LI); @@ -900,12 +894,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) { (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; - if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { - const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); - return EEI && isa<ConstantInt>(EEI->getOperand(1)); - })) - continue; - // Save store location. const ChainID ID = getChainID(Ptr); StoreRefs[ID].push_back(SI); @@ -1289,52 +1277,32 @@ bool Vectorizer::vectorizeLoadChain( Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); propagateMetadata(LI, Chain); - if (VecLoadTy) { - SmallVector<Instruction *, 16> InstrsToErase; - - unsigned VecWidth = VecLoadTy->getNumElements(); - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - for (auto Use : Chain[I]->users()) { - // All users of vector loads are ExtractElement instructions with - // constant indices, otherwise we would have bailed before now. - Instruction *UI = cast<Instruction>(Use); - unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); - unsigned NewIdx = Idx + I * VecWidth; - Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), - UI->getName()); - if (V->getType() != UI->getType()) - V = Builder.CreateBitCast(V, UI->getType()); - - // Replace the old instruction. - UI->replaceAllUsesWith(V); - InstrsToErase.push_back(UI); - } + for (unsigned I = 0, E = Chain.size(); I != E; ++I) { + Value *CV = Chain[I]; + Value *V; + if (VecLoadTy) { + // Extract a subvector using shufflevector. + unsigned VecWidth = VecLoadTy->getNumElements(); + auto Mask = + llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth)); + V = Builder.CreateShuffleVector(LI, Mask, CV->getName()); + } else { + V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); } - // Bitcast might not be an Instruction, if the value being loaded is a - // constant. In that case, no need to reorder anything. - if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) - reorder(BitcastInst); - - for (auto I : InstrsToErase) - I->eraseFromParent(); - } else { - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - Value *CV = Chain[I]; - Value *V = - Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); - if (V->getType() != CV->getType()) { - V = Builder.CreateBitOrPointerCast(V, CV->getType()); - } - - // Replace the old instruction. - CV->replaceAllUsesWith(V); + if (V->getType() != CV->getType()) { + V = Builder.CreateBitOrPointerCast(V, CV->getType()); } - if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) - reorder(BitcastInst); + // Replace the old instruction. + CV->replaceAllUsesWith(V); } + // Bitcast might not be an Instruction, if the value being loaded is a + // constant. In that case, no need to reorder anything. 
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
eraseInstructions(Chain);
++NumVectorInstructions;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4747f34fcc62..d11f4146b590 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -470,10 +470,11 @@ public:
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
/// of various checks and bypasses. Return the pre-header block of the new
- /// loop.
- /// In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
- virtual BasicBlock *createVectorizedLoopSkeleton();
+ /// loop and the start value for the canonical induction, if it is != 0. The
+ /// latter is the case when vectorizing the epilogue loop. In the case of
+ /// epilogue vectorization, this function is overridden to handle the more
+ /// complex control flow around the loops.
+ virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
@@ -507,10 +508,10 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc, VPValue *Def,
- VPTransformState &State);
+ /// the corresponding type. \p CanonicalIV is the scalar value generated for
+ /// the canonical induction variable.
+ void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
+ VPTransformState &State, Value *CanonicalIV);

/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
@@ -556,6 +557,10 @@ public:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);

+ // Returns the resume value (bc.merge.rdx) for a reduction as
+ // generated by fixReduction.
+ PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
+
protected:
friend class LoopVectorizationPlanner;

@@ -573,16 +578,18 @@ protected:
Value *CountRoundDown, Value *EndValue,
BasicBlock *MiddleBlock);

- /// Create a new induction variable inside L.
- PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
- Value *Step, Instruction *DL);
+ /// Introduce a conditional branch (on true, condition to be set later) at the
+ /// end of the header=latch connecting it to itself (across the backedge) and
+ /// to the exit block of \p L.
+ void createHeaderBranch(Loop *L);

/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);

/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
+ void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+ VPTransformState &State);

/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
@@ -606,14 +613,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State); - /// This function adds - /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) - /// to each vector element of Val. The sequence starts at StartIndex. - /// \p Opcode is relevant for FP induction variable. - virtual Value * - getStepVector(Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd); - /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and /// \p EntryVal is the value from the original loop that maps to the steps. @@ -640,9 +639,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// Generate a shuffle sequence that will reverse the vector Vec. - virtual Value *reverseVector(Value *Vec); - /// Returns (and creates if needed) the original loop trip count. Value *getOrCreateTripCount(Loop *NewLoop); @@ -685,14 +681,13 @@ protected: Loop *createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count - /// in the scalar epilogue, from where the vectorized loop left off (given by - /// \p VectorTripCount). + /// in the scalar epilogue, from where the vectorized loop left off. /// In cases where the loop skeleton is more complicated (eg. epilogue /// vectorization) and the resume values can come from an additional bypass /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( - Loop *L, Value *VectorTripCount, + Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -795,12 +790,6 @@ protected: /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector<BasicBlock *, 4> LoopBypassBlocks; - /// The new Induction variable which was added to the new block. - PHINode *Induction = nullptr; - - /// The induction variable of the old basic block. - PHINode *OldInduction = nullptr; - /// Store instructions that were predicated. SmallVector<Instruction *, 4> PredicatedInstructions; @@ -838,6 +827,11 @@ protected: /// Structure to hold information about generated runtime checks, responsible /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; + + // Holds the resume values for reductions in the loops, used to set the + // correct start value of reduction PHIs when vectorizing the epilogue. + SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> + ReductionResumeValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -856,10 +850,6 @@ public: private: Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector( - Value *Val, Value *StartIdx, Value *Step, - Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override; - Value *reverseVector(Value *Vec) override; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -909,14 +899,16 @@ public: // Override this function to handle the more complex control flow around the // three loops. 
- BasicBlock *createVectorizedLoopSkeleton() final override { + std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton() final override { return createEpilogueVectorizedLoopSkeleton(); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + virtual std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -944,7 +936,8 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -973,7 +966,8 @@ public: EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1069,16 +1063,16 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); } +namespace llvm { + /// Return a value for Step multiplied by VF. -static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, - int64_t Step) { +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, + int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; } -namespace llvm { - /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); @@ -1163,7 +1157,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // will lead to gather/scatter instructions, which don't need to be // handled. if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || - isa<VPInterleaveRecipe>(CurRec)) + isa<VPInterleaveRecipe>(CurRec) || + isa<VPCanonicalIVPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1232,6 +1227,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, } } +PHINode *InnerLoopVectorizer::getReductionResumeValue( + const RecurrenceDescriptor &RdxDesc) { + auto It = ReductionResumeValues.find(&RdxDesc); + assert(It != ReductionResumeValues.end() && + "Expected to find a resume value for the reduction."); + return It->second; +} + namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -1556,13 +1559,16 @@ public: /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. 
- bool isLegalGatherOrScatter(Value *V) { + bool isLegalGatherOrScatter(Value *V, + ElementCount VF = ElementCount::getFixed(1)) { bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); + if (VF.isVector()) + Ty = VectorType::get(Ty, VF); return (LI && TTI.isLegalMaskedGather(Ty, Align)) || (SI && TTI.isLegalMaskedScatter(Ty, Align)); } @@ -1577,16 +1583,17 @@ public: } /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. - bool isScalarWithPredication(Instruction *I) const; + /// predication when vectorizing \p I with vectorization factor \p VF. Such + /// instructions include conditional stores and instructions that may divide + /// by zero. + bool isScalarWithPredication(Instruction *I, ElementCount VF) const; // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. + // \p VF is the vectorization factor that will be used to vectorize \p I. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + bool isPredicatedInst(Instruction *I, ElementCount VF, + bool IsKnownUniform = false) { // When we know the load is uniform and the original scalar loop was not // predicated we don't need to mark it as a predicated instruction. Any // vectorised blocks created when tail-folding are something artificial we @@ -1602,7 +1609,7 @@ public: // instructions. if (isa<LoadInst>(I) || isa<StoreInst>(I)) return Legal->isMaskRequired(I); - return isScalarWithPredication(I); + return isScalarWithPredication(I, VF); } /// Returns true if \p I is a memory instruction with consecutive memory @@ -1794,7 +1801,7 @@ private: /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. - bool useEmulatedMaskMemRefHack(Instruction *I); + bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated @@ -2078,8 +2085,8 @@ public: /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { - SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); - SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); + SCEVExpanderCleaner SCEVCleaner(SCEVExp); + SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); if (!SCEVCheckCond) SCEVCleaner.markResultUsed(); @@ -2335,6 +2342,60 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } +/// This function adds +/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) +/// to each vector element of Val. The sequence starts at StartIndex. +/// \p Opcode is relevant for FP induction variable. +static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, + Instruction::BinaryOps BinOp, ElementCount VF, + IRBuilder<> &Builder) { + assert(VF.isVector() && "only vector VFs are supported"); + + // Create and check the types. 
+ auto *ValVTy = cast<VectorType>(Val->getType()); + ElementCount VLen = ValVTy->getElementCount(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + + SmallVector<Constant *, 8> Indices; + + // Create a vector of consecutive numbers from zero to VF. + VectorType *InitVecValVTy = ValVTy; + Type *InitVecValSTy = STy; + if (STy->isFloatingPointTy()) { + InitVecValSTy = + IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); + InitVecValVTy = VectorType::get(InitVecValSTy, VLen); + } + Value *InitVec = Builder.CreateStepVector(InitVecValVTy); + + // Splat the StartIdx + Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); + + if (STy->isIntegerTy()) { + InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + Step = Builder.CreateMul(InitVec, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); + InitVec = Builder.CreateUIToFP(InitVec, ValVTy); + InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); + + Step = Builder.CreateVectorSplat(VLen, Step); + Value *MulOp = Builder.CreateFMul(InitVec, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); +} + void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, VPTransformState &State) { @@ -2355,8 +2416,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = - getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); + Value *SteppedStart = getStepVector( + SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); // We create vector phi nodes for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. @@ -2411,8 +2472,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // placement of all induction updates. 
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); - auto *ICmp = cast<Instruction>(Br->getCondition()); - LastInduction->moveBefore(ICmp); + LastInduction->moveBefore(Br); LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); @@ -2434,15 +2494,15 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, - const InductionDescriptor &ID, - Value *Start, TruncInst *Trunc, - VPValue *Def, - VPTransformState &State) { +void InnerLoopVectorizer::widenIntOrFpInduction( + PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, + Value *CanonicalIV) { + Value *Start = Def->getStartValue()->getLiveInIRValue(); + const InductionDescriptor &ID = Def->getInductionDescriptor(); + TruncInst *Trunc = Def->getTruncInst(); IRBuilder<> &Builder = State.Builder; - assert((IV->getType()->isIntegerTy() || IV != OldInduction) && - "Primary induction variable must have an integer type"); assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); + assert(!State.VF.isZero() && "VF must be non-zero"); // The value from the original loop to which we are mapping the new induction // variable. @@ -2468,12 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, // induction variable and step. Otherwise, derive these values from the // induction descriptor. auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = Induction; - if (IV != OldInduction) { - ScalarIV = IV->getType()->isIntegerTy() - ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) - : Builder.CreateCast(Instruction::SIToFP, Induction, - IV->getType()); + Value *ScalarIV = CanonicalIV; + Type *NeededType = IV->getType(); + if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { + ScalarIV = + NeededType->isIntegerTy() + ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) + : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, State.CFG.PrevBB); ScalarIV->setName("offset.idx"); @@ -2493,7 +2554,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) StartIdx = @@ -2502,7 +2562,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = - getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); + getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(), + State.VF, State.Builder); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2516,9 +2577,31 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, // Now do the actual transformations, and start with creating the step value. 
Value *Step = CreateStepValue(ID.getStep()); - if (State.VF.isZero() || State.VF.isScalar()) { + if (State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); - CreateSplatIV(ScalarIV, Step); + Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), + Step->getType()->getScalarSizeInBits()); + + Instruction::BinaryOps IncOp = ID.getInductionOpcode(); + if (IncOp == Instruction::BinaryOpsEnd) + IncOp = Instruction::Add; + for (unsigned Part = 0; Part < UF; ++Part) { + Value *StartIdx = ConstantInt::get(ScalarTy, Part); + Instruction::BinaryOps MulOp = Instruction::Mul; + if (Step->getType()->isFloatingPointTy()) { + StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); + MulOp = Instruction::FMul; + } + + Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); + State.set(Def, EntryPart, Part); + if (Trunc) { + assert(!Step->getType()->isFloatingPointTy() && + "fp inductions shouldn't be truncated"); + addMetadata(EntryPart, Trunc); + } + } return; } @@ -2554,54 +2637,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } -Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, - Value *Step, - Instruction::BinaryOps BinOp) { - // Create and check the types. - auto *ValVTy = cast<VectorType>(Val->getType()); - ElementCount VLen = ValVTy->getElementCount(); - - Type *STy = Val->getType()->getScalarType(); - assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && - "Induction Step must be an integer or FP"); - assert(Step->getType() == STy && "Step has wrong type"); - - SmallVector<Constant *, 8> Indices; - - // Create a vector of consecutive numbers from zero to VF. - VectorType *InitVecValVTy = ValVTy; - Type *InitVecValSTy = STy; - if (STy->isFloatingPointTy()) { - InitVecValSTy = - IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); - InitVecValVTy = VectorType::get(InitVecValSTy, VLen); - } - Value *InitVec = Builder.CreateStepVector(InitVecValVTy); - - // Splat the StartIdx - Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); - - if (STy->isIntegerTy()) { - InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); - Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(InitVec, Step); - return Builder.CreateAdd(Val, Step, "induction"); - } - - // Floating point induction. 
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && - "Binary Opcode should be specified for FP induction"); - InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); - - Step = Builder.CreateVectorSplat(VLen, Step); - Value *MulOp = Builder.CreateFMul(InitVec, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); -} - void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, @@ -2691,11 +2726,6 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, State.set(Def, VectorValue, Instance.Part); } -Value *InnerLoopVectorizer::reverseVector(Value *Vec) { - assert(Vec->getType()->isVectorTy() && "Invalid type"); - return Builder.CreateVectorReverse(Vec, "reverse"); -} - // Return whether we allow using masked interleave-groups (for dealing with // strided loads/stores that reside in predicated blocks, or for dealing // with gaps). @@ -2858,7 +2888,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } if (Group->isReverse()) - StridedVec = reverseVector(StridedVec); + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); State.set(VPDefs[J], StridedVec, Part); } @@ -2894,7 +2924,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *StoredVec = State.get(StoredValues[i], Part); if (Group->isReverse()) - StoredVec = reverseVector(StoredVec); + StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); // If this member has different type, cast it to a unified type. @@ -2993,43 +3023,21 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } -PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, - Value *End, Value *Step, - Instruction *DL) { +void InnerLoopVectorizer::createHeaderBranch(Loop *L) { BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - // As we're just creating this loop, it's possible no latch exists - // yet. If so, use the header as this will be a single block loop. - if (!Latch) - Latch = Header; - - IRBuilder<> B(&*Header->getFirstInsertionPt()); - Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); - setDebugLocFromInst(OldInst, &B); - auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); + assert(!L->getLoopLatch() && "loop should not have a latch at this point"); - B.SetInsertPoint(Latch->getTerminator()); + IRBuilder<> B(Header->getTerminator()); + Instruction *OldInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); setDebugLocFromInst(OldInst, &B); - // Create i+1 and fill the PHINode. - // - // If the tail is not folded, we know that End - Start >= Step (either - // statically or through the minimum iteration checks). We also know that both - // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + - // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned - // overflows and we can mark the induction increment as NUW. - Value *Next = B.CreateAdd(Induction, Step, "index.next", - /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); - Induction->addIncoming(Start, L->getLoopPreheader()); - Induction->addIncoming(Next, Latch); - // Create the compare. - Value *ICmp = B.CreateICmpEQ(Next, End); - B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); + // Connect the header to the exit and header blocks and replace the old + // terminator. 
+ B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); // Now we have two terminators. Remove the old one from the block. - Latch->getTerminator()->eraseFromParent(); - - return Induction; + Header->getTerminator()->eraseFromParent(); } Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { @@ -3099,10 +3107,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - assert(!VF.isScalable() && - "Tail folding not yet supported for scalable vectors"); + Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); TC = Builder.CreateAdd( - TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); + TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -3436,12 +3443,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { } void InnerLoopVectorizer::createInductionResumeValues( - Loop *L, Value *VectorTripCount, - std::pair<BasicBlock *, Value *> AdditionalBypass) { - assert(VectorTripCount && L && "Expected valid arguments"); + Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); + + Value *VectorTripCount = getOrCreateVectorTripCount(L); + assert(VectorTripCount && L && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3449,6 +3457,7 @@ void InnerLoopVectorizer::createInductionResumeValues( // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. + Instruction *OldInduction = Legal->getPrimaryInduction(); for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3546,25 +3555,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - Optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); - if (VectorizedLoopID.hasValue()) { - L->setLoopID(VectorizedLoopID.getValue()); - - // Do not setAlreadyVectorized if loop attributes have been defined - // explicitly. - return LoopVectorPreHeader; - } - - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE, TTI); - Hints.setAlreadyVectorized(); - #ifdef EXPENSIVE_CHECKS assert(DT->verify(DominatorTree::VerificationLevel::Fast)); LI->verify(*DT); @@ -3573,7 +3563,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. 
The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3638,33 +3629,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // faster. emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); - Value *Step = createStepForVF(Builder, IdxTy, VF, UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); + createInductionResumeValues(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -4088,8 +4058,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { } } -void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, - VPTransformState &State) { +void InnerLoopVectorizer::fixFirstOrderRecurrence( + VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the // following loop. @@ -4334,13 +4304,29 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, : Builder.CreateZExt(ReducedPartRdx, PhiTy); } + PHINode *ResumePhi = + dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); + // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); - BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + + // If we are fixing reductions in the epilogue loop then we should already + // have created a bc.merge.rdx Phi after the main vector body. Ensure that + // we carry over the incoming values correctly. 
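A note on the tail-folded trip count in getOrCreateVectorTripCount above: adding VF * UF - 1 before the remainder subtraction later in the function (not quoted in this hunk) rounds the scalar trip count up to the next multiple of VF * UF, and the power-of-two assertion makes that remainder a cheap mask. A small self-contained C++ check with assumed numbers:

#include <cassert>
#include <cstdio>

int main() {
  const unsigned TC = 13;   // assumed scalar trip count
  const unsigned VFxUF = 8; // VF * UF, asserted above to be a power of two
  unsigned RoundedUp = TC + (VFxUF - 1);           // "n.rnd.up"
  unsigned VecTripCount = RoundedUp - RoundedUp % VFxUF;
  assert(VecTripCount == 16 && VecTripCount >= TC);
  printf("n.rnd.up = %u, vector trip count = %u\n", RoundedUp, VecTripCount);
  return 0;
}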
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); + else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), + Incoming); + else + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + } + + // Set the resume value for this reduction + ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. @@ -4557,6 +4543,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, InductionDescriptor II = Legal->getInductionVars().lookup(P); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); + PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. switch (II.getKind()) { @@ -4572,7 +4561,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, if (Cost->isScalarAfterVectorization(P, State.VF)) { // This is the normalized GEP that starts counting at zero. Value *PtrInd = - Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); + Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. @@ -4602,10 +4591,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Type *PhiType = II.getStep()->getType(); // Build a pointer phi - Value *ScalarStartValue = II.getStartValue(); + Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); Type *ScStValueType = ScalarStartValue->getType(); PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); + PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); // A pointer induction, performed by using a gep @@ -4916,7 +4905,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, ElementCount VF) const { if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; switch(I->getOpcode()) { @@ -4928,11 +4918,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); + Type *VTy = Ty; + if (VF.isVector()) + VTy = VectorType::get(Ty, VF); const Align Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || - TTI.isLegalMaskedGather(Ty, Alignment)) + TTI.isLegalMaskedGather(VTy, Alignment)) : !(isLegalMaskedStore(Ty, Ptr, Alignment) || - TTI.isLegalMaskedScatter(Ty, Alignment)); + TTI.isLegalMaskedScatter(VTy, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -5005,7 +4998,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // If the instruction is a store located in a predicated block, it will be // scalarized. 
- if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5056,7 +5049,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -5531,10 +5524,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } } - // For scalable vectors, don't use tail folding as this is currently not yet - // supported. The code is likely to have ended up here if the tripcount is - // low, in which case it makes sense not to use scalable vectors. - if (MaxFactors.ScalableVF.isVector()) + // For scalable vectors don't use tail folding for low trip counts or + // optimizing for code size. We only permit this if the user has explicitly + // requested it. + if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && + ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && + MaxFactors.ScalableVF.isVector()) MaxFactors.ScalableVF = ElementCount::getScalable(0); // If we don't know the precise trip count, or if the trip count that we @@ -5849,10 +5844,8 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { - return Legal->isFirstOrderRecurrence(&Phi) || - Legal->isReductionVariable(&Phi); - })) + if (any_of(L.getHeader()->phis(), + [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) return false; // Phis with uses outside of the loop require special handling and are @@ -5978,11 +5971,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); - for (Type *T : ElementTypesInLoop) { - MinWidth = std::min<unsigned>( - MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); - MaxWidth = std::max<unsigned>( - MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + // For in-loop reductions, no element types are added to ElementTypesInLoop + // if there are no loads/stores in the loop. In this case, check through the + // reduction variables to determine the maximum width. + if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { + // Reset MaxWidth so that we can find the smallest type used by recurrences + // in the loop. + MaxWidth = -1U; + for (auto &PhiDescriptorPair : Legal->getReductionVars()) { + const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; + // When finding the min width used by the recurrence we need to account + // for casts on the input operands of the recurrence. 
+ MaxWidth = std::min<unsigned>( + MaxWidth, std::min<unsigned>( + RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), + RdxDesc.getRecurrenceType()->getScalarSizeInBits())); + } + } else { + for (Type *T : ElementTypesInLoop) { + MinWidth = std::min<unsigned>( + MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + MaxWidth = std::max<unsigned>( + MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + } } return {MinWidth, MaxWidth}; } @@ -6022,18 +6033,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *ST = dyn_cast<StoreInst>(&I)) T = ST->getValueOperand()->getType(); - // Ignore loaded pointer types and stored pointer types that are not - // vectorizable. - // - // FIXME: The check here attempts to predict whether a load or store will - // be vectorized. We only know this for certain after a VF has - // been selected. Here, we assume that if an access can be - // vectorized, it will be. We should also look at extending this - // optimization to non-pointer types. - // - if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) - continue; + assert(T->isSized() && + "Expected the load/store/recurrence type to be sized"); ElementTypesInLoop.insert(T); } @@ -6475,7 +6476,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { return RUs; } -bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, + ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such @@ -6484,8 +6486,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && - "Expecting a scalar emulated instruction"); + assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate); @@ -6512,13 +6513,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { if (!blockNeedsPredicationForAnyReason(BB)) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && + if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. @@ -6554,7 +6555,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. 
- if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, VF)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6601,7 +6602,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -6764,7 +6765,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (isPredicatedInst(I, VF)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6775,7 +6776,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, /*Insert=*/false, /*Extract=*/true); Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); - if (useEmulatedMaskMemRefHack(I)) + if (useEmulatedMaskMemRefHack(I, VF)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. Cost = 3000000; @@ -7182,7 +7183,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7192,7 +7193,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract InstructionCost Cost; if (isa<StoreInst>(&I) && VF.isScalable() && - isLegalGatherOrScatter(&I)) { + isLegalGatherOrScatter(&I, VF)) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { @@ -7234,7 +7235,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { } InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I) + isLegalGatherOrScatter(&I, VF) ? getGatherScatterCost(&I, VF) * NumAccesses : InstructionCost::getInvalid(); @@ -7437,7 +7438,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, VF)) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7941,6 +7942,40 @@ VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { llvm_unreachable("No plan found!"); } +static void AddRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. 
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const auto *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = + S && S->getString().startswith("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(i)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } +} + void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, @@ -7952,9 +7987,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 1. Create a new empty loop. Unlink the old loop and connect the new one. VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); - State.TripCount = ILV.getOrCreateTripCount(nullptr); - State.CanonicalIV = ILV.Induction; + Value *CanonicalIVStartValue; + std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = + ILV.createVectorizedLoopSkeleton(); ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -7968,8 +8003,35 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. + BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State); BestVPlan.execute(&State); + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + Optional<MDNode *> VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); + + Loop *L = LI->getLoopFor(State.CFG.PrevBB); + if (VectorizedLoopID.hasValue()) + L->setLoopID(VectorizedLoopID.getValue()); + else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + // Disable runtime unrolling when vectorizing the epilogue loop. + if (CanonicalIVStartValue) + AddRuntimeUnrollDisableMetaData(L); + // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. ILV.fixVectorizedLoop(State); @@ -8032,66 +8094,16 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( } } -Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } - Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, - Value *Step, - Instruction::BinaryOps BinOp) { - // When unrolling and the VF is 1, we only need to add a simple scalar. 
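A note on AddRuntimeUnrollDisableMetaData above: LLVM loop IDs are MDNodes whose operand 0 is a self-reference, which is why the first slot is reserved with nullptr and patched afterwards via replaceOperandWith. A condensed sketch of that pattern, reusing only API calls that already appear in the hunk; it assumes LLVM headers are available and is not a drop-in replacement.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Build a fresh loop ID that carries only the runtime-unroll-disable property.
static MDNode *makeRuntimeUnrollDisabledLoopID(LLVMContext &Context) {
  Metadata *Disable[] = {
      MDString::get(Context, "llvm.loop.unroll.runtime.disable")};
  MDNode *DisableNode = MDNode::get(Context, Disable);

  // Slot 0 is reserved for the self-reference and filled in afterwards,
  // which is what makes the node distinct from any other loop ID.
  Metadata *Ops[] = {nullptr, DisableNode};
  MDNode *LoopID = MDNode::get(Context, Ops);
  LoopID->replaceOperandWith(0, LoopID);
  return LoopID;
}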
- Type *Ty = Val->getType(); - assert(!Ty->isVectorTy() && "Val must be a scalar"); - - if (Ty->isFloatingPointTy()) { - // Floating-point operations inherit FMF via the builder's flags. - Value *MulOp = Builder.CreateFMul(StartIdx, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp); - } - return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); -} - -static void AddRuntimeUnrollDisableMetaData(Loop *L) { - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - bool IsUnrollMetadata = false; - MDNode *LoopID = L->getLoopID(); - if (LoopID) { - // First find existing loop unrolling disable metadata. - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const auto *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = - S && S->getString().startswith("llvm.loop.unroll.disable"); - } - MDs.push_back(LoopID->getOperand(i)); - } - } - - if (!IsUnrollMetadata) { - // Add runtime unroll disable metadata. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back( - MDString::get(Context, "llvm.loop.unroll.runtime.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); - } -} - //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton(""); @@ -8120,24 +8132,16 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); - Value *Step = getRuntimeVF(B, IdxTy, VF * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); EPI.VectorTripCount = CountRoundDown; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, // because the vplan in the second pass still contains the inductions from the // original loop. 
- return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -8219,7 +8223,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock * +std::pair<BasicBlock *, Value *> EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton("vec.epilog."); @@ -8275,6 +8279,25 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { LoopBypassBlocks.push_back(EPI.MemSafetyCheck); LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); + // The vec.epilog.iter.check block may contain Phi nodes from reductions which + // merge control-flow from the latch block and the middle block. Update the + // incoming values here and move the Phi into the preheader. + SmallVector<PHINode *, 4> PhisInBlock; + for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) + PhisInBlock.push_back(&Phi); + + for (PHINode *Phi : PhisInBlock) { + Phi->replaceIncomingBlockWith( + VecEpilogueIterationCountCheck->getSinglePredecessor(), + VecEpilogueIterationCountCheck); + Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); + if (EPI.SCEVSafetyCheck) + Phi->removeIncomingValue(EPI.SCEVSafetyCheck); + if (EPI.MemSafetyCheck) + Phi->removeIncomingValue(EPI.MemSafetyCheck); + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); + } + // Generate a resume induction for the vector epilogue and put it in the // vector epilogue preheader Type *IdxTy = Legal->getWidestInductionType(); @@ -8285,13 +8308,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPI.MainLoopIterationCountCheck); // Generate the induction variable. - OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); - Value *StartIdx = EPResumeVal; - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); + createHeaderBranch(Lp); // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. They are used to test if there are any tail @@ -8300,12 +8317,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues(Lp, CountRoundDown, - {VecEpilogueIterationCountCheck, - EPI.VectorTripCount} /* AdditionalBypass */); + createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, + EPI.VectorTripCount} /* AdditionalBypass */); - AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; } BasicBlock * @@ -8447,33 +8462,22 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV in the header block. 
- VPValue *IV = nullptr; - if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); - else { - VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); - auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); - IV = IVRecipe; - } + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); + HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); - // Create the block in mask as the first non-phi instruction in the block. VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - - VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - bool TailFolded = !CM.isScalarEpilogueAllowed(); - - if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { - // While ActiveLaneMask is a binary op that consumes the loop tripcount - // as a second argument, we only pass the IV here and extract the - // tripcount from the transform state where codegen of the VP instructions - // happen. - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + if (CM.TTI.emitGetActiveLaneMask()) { + VPValue *TC = Plan->getOrCreateTripCount(); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); } else { + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; @@ -8621,7 +8625,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, VF); + }, Range); if (IsPredicated) @@ -8661,7 +8667,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { // scalarization is profitable or it is predicated. 
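A note on the header-mask comparison above: the mask is IV <= BTC (the backedge-taken count) rather than IV < TC because TC = BTC + 1 can wrap to zero in the induction type, which would leave every lane inactive. A standalone C++ illustration with an assumed 8-bit induction:

#include <cstdint>
#include <cstdio>

int main() {
  // A loop that runs 256 times: BTC = 255 fits in i8, TC = BTC + 1 wraps to 0.
  uint8_t BTC = 255;
  uint8_t TC = (uint8_t)(BTC + 1); // 0 after wrapping

  uint8_t IV = 252; // first lane index of the final vector iteration
  const int VF = 4;
  for (int J = 0; J < VF; ++J) {
    uint8_t Lane = (uint8_t)(IV + J);
    // The second form is always false once TC has wrapped.
    printf("lane %d: IV<=BTC -> %d, IV<TC -> %d\n", (int)Lane,
           (int)(Lane <= BTC), (int)(Lane < TC));
  }
  return 0;
}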
auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8719,7 +8726,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, void VPRecipeBuilder::fixHeaderPhis() { BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); - for (VPWidenPHIRecipe *R : PhisToFix) { + for (VPHeaderPHIRecipe *R : PhisToFix) { auto *PN = cast<PHINode>(R->getUnderlyingValue()); VPRecipeBase *IncR = getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); @@ -8735,7 +8742,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, Range); // Even if the instruction is not marked as uniform, there are certain @@ -8861,7 +8868,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) return toVPRecipeResult(Recipe); - VPWidenPHIRecipe *PhiRecipe = nullptr; + VPHeaderPHIRecipe *PhiRecipe = nullptr; if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { @@ -8882,11 +8889,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); PhisToFix.push_back(PhiRecipe); } else { - // TODO: record start and backedge value for remaining pointer induction - // phis. + // TODO: record backedge value for remaining pointer induction phis. assert(Phi->getType()->isPointerTy() && "only pointer phis should be handled here"); - PhiRecipe = new VPWidenPHIRecipe(Phi); + assert(Legal->getInductionVars().count(Phi) && + "Not an induction variable"); + InductionDescriptor II = Legal->getInductionVars().lookup(Phi); + VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); + PhiRecipe = new VPWidenPHIRecipe(Phi, Start); } return toVPRecipeResult(PhiRecipe); @@ -8966,6 +8976,40 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } +// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a +// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a +// BranchOnCount VPInstruction to the latch. +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, + bool HasNUW, bool IsVPlanNative) { + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getOrAddVPValue(StartIdx); + + auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + if (IsVPlanNative) + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + Header->insert(CanonicalIVPHI, Header->begin()); + + auto *CanonicalIVIncrement = + new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementNUW + : VPInstruction::CanonicalIVIncrement, + {CanonicalIVPHI}, DL); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + + VPBasicBlock *EB = TopRegion->getExitBasicBlock(); + if (IsVPlanNative) { + EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); + EB->setCondBit(nullptr); + } + EB->appendRecipe(CanonicalIVIncrement); + + auto *BranchOnCount = + new VPInstruction(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchOnCount); +} + VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, const MapVector<Instruction *, Instruction *> &SinkAfter) { @@ -9033,6 +9077,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); auto Plan = std::make_unique<VPlan>(TopRegion); + Instruction *DLInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + DLInst ? DLInst->getDebugLoc() : DebugLoc(), + !CM.foldTailByMasking(), false); + // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); @@ -9194,6 +9244,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } + VPlanTransforms::removeRedundantCanonicalIVs(*Plan); VPlanTransforms::removeRedundantInductionCasts(*Plan); // Now that sink-after is done, move induction recipes for optimized truncates @@ -9325,6 +9376,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, DeadInstructions, *PSE.getSE()); + + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), + true, true); return Plan; } @@ -9414,16 +9468,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } // If tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the end of the latch. + // and the live-out instruction of each reduction, at the beginning of the + // dedicated latch block. 
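A note on addCanonicalIVRecipes above: it rebuilds, as VPlan recipes, the loop control that createHeaderBranch no longer emits directly: a phi starting at zero, an increment by VF * UF, and a BranchOnCount test against the vector trip count. As plain control flow, the generated loop behaves like this small C++ model, with assumed VF, UF and trip count:

#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const unsigned VectorTripCount = 16; // a multiple of VF * UF by construction

  unsigned CanonicalIV = 0; // VPCanonicalIVPHIRecipe: starts at 0
  do {
    printf("vector iteration over lanes [%u, %u)\n", CanonicalIV,
           CanonicalIV + VF * UF);
    CanonicalIV += VF * UF;                 // CanonicalIVIncrement
  } while (CanonicalIV != VectorTripCount); // BranchOnCount
  return 0;
}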
if (CM.foldTailByMasking()) { + Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); if (!PhiR || PhiR->isInLoop()) continue; - Builder.setInsertPoint(LatchVPBB); VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); VPValue *Red = PhiR->getBackedgeValue(); + assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && + "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } @@ -9682,9 +9739,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), - getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), State); + auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); + State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10013,7 +10069,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(getVPSingleValue(), NewLI, Part); + State.set(this, NewLI, Part); } } @@ -10561,6 +10617,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + + // Ensure that the start values for any VPReductionPHIRecipes are + // updated before vectorising the epilogue loop. + VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { + if (auto *Resume = MainILV.getReductionResumeValue( + ReductionPhi->getRecurrenceDescriptor())) { + VPValue *StartVal = new VPValue(Resume); + BestEpiPlan.addExternalDef(StartVal); + ReductionPhi->setOperand(0, StartVal); + } + } + } + LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT); ++LoopsEpilogueVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 37ae13666f7a..99c265fc5101 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -435,7 +435,7 @@ struct InstructionsState { } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } + bool isAltShuffle() const { return AltOp != MainOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); @@ -581,7 +581,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. 
-static MemoryLocation getLocation(Instruction *I, AAResults *AA) { +static MemoryLocation getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return MemoryLocation::get(SI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -1417,7 +1417,11 @@ public: HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); } else if (NumFreeOpsHash.NumOfAPOs == Min && NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { - ++HashMap[NumFreeOpsHash.Hash].first; + auto It = HashMap.find(NumFreeOpsHash.Hash); + if (It == HashMap.end()) + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + else + ++It->second.first; } } // Select the lane with the minimum counter. @@ -2019,9 +2023,7 @@ private: } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { - return getOpcode() != getAltOpcode(); - } + bool isAltShuffle() const { return MainOp != AltOp; } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); @@ -2519,12 +2521,11 @@ private: SD->IsScheduled = true; LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ScheduleData *BundleMember = SD; - while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; + for (ScheduleData *BundleMember = SD; BundleMember; + BundleMember = BundleMember->NextInBundle) { + if (BundleMember->Inst != BundleMember->OpValue) continue; - } + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2589,7 +2590,6 @@ private: << "SLP: gets ready (mem): " << *DepBundle << "\n"); } } - BundleMember = BundleMember->NextInBundle; } } @@ -2618,6 +2618,10 @@ private: } } + /// Build a bundle from the ScheduleData nodes corresponding to the + /// scalar instruction for each lane. + ScheduleData *buildBundle(ArrayRef<Value *> VL); + /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. @@ -3040,7 +3044,7 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. - DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; + DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; // ExtractElement gather nodes which can be vectorized and need to handle // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; @@ -3051,6 +3055,29 @@ void BoUpSLP::reorderTopToBottom() { const std::unique_ptr<TreeEntry> &TE) { if (Optional<OrdersType> CurrentOrder = getReorderingData(*TE.get(), /*TopToBottom=*/true)) { + // Do not include ordering for nodes used in the alt opcode vectorization, + // better to reorder them during bottom-to-top stage. If follow the order + // here, it causes reordering of the whole graph though actually it is + // profitable just to reorder the subgraph that starts from the alternate + // opcode vectorization node. Such nodes already end-up with the shuffle + // instruction and it is just enough to change this shuffle rather than + // rotate the scalars for the whole graph. 
+ unsigned Cnt = 0; + const TreeEntry *UserTE = TE.get(); + while (UserTE && Cnt < RecursionMaxDepth) { + if (UserTE->UserTreeIndices.size() != 1) + break; + if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { + return EI.UserTE->State == TreeEntry::Vectorize && + EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; + })) + return; + if (UserTE->UserTreeIndices.empty()) + UserTE = nullptr; + else + UserTE = UserTE->UserTreeIndices.back().UserTE; + ++Cnt; + } VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); @@ -3066,7 +3093,7 @@ void BoUpSLP::reorderTopToBottom() { // Try to find the most profitable order. We just are looking for the most // used order and reorder scalar elements in the nodes according to this // mostly used order. - const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond(); + ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef(); // All operands are reordered and used only in this node - propagate the // most used order to the user node. MapVector<OrdersType, unsigned, @@ -4459,6 +4486,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, CurrentOrder.clear(); return false; } + if (ShouldKeepOrder) + CurrentOrder.clear(); return ShouldKeepOrder; } @@ -7202,6 +7231,33 @@ void BoUpSLP::optimizeGatherSequence() { GatherShuffleSeq.clear(); } +BoUpSLP::ScheduleData * +BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { + ScheduleData *Bundle = nullptr; + ScheduleData *PrevInBundle = nullptr; + for (Value *V : VL) { + ScheduleData *BundleMember = getScheduleData(V); + assert(BundleMember && + "no ScheduleData for bundle member " + "(maybe not in same basic block)"); + assert(BundleMember->isSchedulingEntity() && + "bundle member already part of other bundle"); + if (PrevInBundle) { + PrevInBundle->NextInBundle = BundleMember; + } else { + Bundle = BundleMember; + } + BundleMember->UnscheduledDepsInBundle = 0; + Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; + + // Group the instructions to a bundle. + BundleMember->FirstInBundle = Bundle; + PrevInBundle = BundleMember; + } + assert(Bundle && "Failed to find schedule bundle"); + return Bundle; +} + // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. Optional<BoUpSLP::ScheduleData *> @@ -7214,12 +7270,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; - ScheduleData *PrevInBundle = nullptr; - ScheduleData *Bundle = nullptr; - bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); - auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule, + auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule, ScheduleData *Bundle) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to @@ -7263,39 +7316,28 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Otherwise the compiler may crash trying to incorrectly calculate // dependencies and emit instruction in the wrong order at the actual // scheduling. 
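A note on the buildBundle helper factored out above: it threads the per-lane ScheduleData nodes into a singly linked bundle whose head accumulates the pending-dependency count for the whole group. A minimal standalone C++ model of that invariant; the Node struct is a stand-in for ScheduleData, not the real type.

#include <cassert>
#include <vector>

struct Node {
  Node *FirstInBundle = nullptr;
  Node *NextInBundle = nullptr;
  int UnscheduledDeps = 0;         // pending dependencies of this node
  int UnscheduledDepsInBundle = 0; // meaningful only on the bundle head
};

// Mirrors BlockScheduling::buildBundle: link the lanes and roll the
// per-node counts up into the head.
static Node *buildBundle(const std::vector<Node *> &Lanes) {
  Node *Bundle = nullptr, *Prev = nullptr;
  for (Node *N : Lanes) {
    if (Prev)
      Prev->NextInBundle = N;
    else
      Bundle = N;
    N->UnscheduledDepsInBundle = 0;
    Bundle->UnscheduledDepsInBundle += N->UnscheduledDeps;
    N->FirstInBundle = Bundle; // every member points at the head
    Prev = N;
  }
  return Bundle;
}

int main() {
  Node A, B, C;
  A.UnscheduledDeps = 1;
  C.UnscheduledDeps = 2;
  Node *Head = buildBundle({&A, &B, &C});
  assert(Head == &A && Head->UnscheduledDepsInBundle == 3);
  assert(A.NextInBundle == &B && C.FirstInBundle == &A);
  return 0;
}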
- TryScheduleBundle(/*ReSchedule=*/false, nullptr); + TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr); return None; } } + bool ReSchedule = false; for (Value *V : VL) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); - if (BundleMember->IsScheduled) { - // A bundle member was scheduled as single instruction before and now - // needs to be scheduled as part of the bundle. We just get rid of the - // existing schedule. - LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember - << " was already scheduled\n"); - ReSchedule = true; - } - assert(BundleMember->isSchedulingEntity() && - "bundle member already part of other bundle"); - if (PrevInBundle) { - PrevInBundle->NextInBundle = BundleMember; - } else { - Bundle = BundleMember; - } - BundleMember->UnscheduledDepsInBundle = 0; - Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; - - // Group the instructions to a bundle. - BundleMember->FirstInBundle = Bundle; - PrevInBundle = BundleMember; + if (!BundleMember->IsScheduled) + continue; + // A bundle member was scheduled as single instruction before and now + // needs to be scheduled as part of the bundle. We just get rid of the + // existing schedule. + LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember + << " was already scheduled\n"); + ReSchedule = true; } - assert(Bundle && "Failed to find schedule bundle"); - TryScheduleBundle(ReSchedule, Bundle); + + auto *Bundle = buildBundle(VL); + TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); return None; @@ -7464,20 +7506,33 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, while (!WorkList.empty()) { ScheduleData *SD = WorkList.pop_back_val(); - - ScheduleData *BundleMember = SD; - while (BundleMember) { + for (ScheduleData *BundleMember = SD; BundleMember; + BundleMember = BundleMember->NextInBundle) { assert(isInSchedulingRegion(BundleMember)); - if (!BundleMember->hasValidDependencies()) { - - LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember - << "\n"); - BundleMember->Dependencies = 0; - BundleMember->resetUnscheduledDeps(); + if (BundleMember->hasValidDependencies()) + continue; - // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember + << "\n"); + BundleMember->Dependencies = 0; + BundleMember->resetUnscheduledDeps(); + + // Handle def-use chain dependencies. 
+ if (BundleMember->OpValue != BundleMember->Inst) { + ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + } + } else { + for (User *U : BundleMember->Inst->users()) { + assert(isa<Instruction>(U) && + "user of instruction must be instruction"); + ScheduleData *UseSD = getScheduleData(U); if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; @@ -7486,89 +7541,69 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa<Instruction>(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. - BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } - } } + } - // Handle the memory dependencies. - ScheduleData *DepDest = BundleMember->NextLoadStore; - if (DepDest) { - Instruction *SrcInst = BundleMember->Inst; - MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); - unsigned numAliased = 0; - unsigned DistToSrc = 1; - - while (DepDest) { - assert(isInSchedulingRegion(DepDest)); - - // We have two limits to reduce the complexity: - // 1) AliasedCheckLimit: It's a small limit to reduce calls to - // SLP->isAliased (which is the expensive part in this loop). - // 2) MaxMemDepDistance: It's for very large blocks and it aborts - // the whole loop (even if the loop is fast, it's quadratic). - // It's important for the loop break condition (see below) to - // check this limit even between two read-only instructions. - if (DistToSrc >= MaxMemDepDistance || - ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && - (numAliased >= AliasedCheckLimit || - SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { - - // We increment the counter only if the locations are aliased - // (instead of counting all alias checks). This gives a better - // balance between reduced runtime and accurate dependencies. - numAliased++; - - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } - } - DepDest = DepDest->NextLoadStore; - - // Example, explaining the loop break condition: Let's assume our - // starting instruction is i0 and MaxMemDepDistance = 3. - // - // +--------v--v--v - // i0,i1,i2,i3,i4,i5,i6,i7,i8 - // +--------^--^--^ - // - // MaxMemDepDistance let us stop alias-checking at i3 and we add - // dependencies from i0 to i3,i4,.. (even if they are not aliased). 
- // Previously we already added dependencies from i3 to i6,i7,i8 - // (because of MaxMemDepDistance). As we added a dependency from - // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 - // and we can abort this loop at i6. - if (DistToSrc >= 2 * MaxMemDepDistance) - break; - DistToSrc++; + // Handle the memory dependencies (if any). + ScheduleData *DepDest = BundleMember->NextLoadStore; + if (!DepDest) + continue; + Instruction *SrcInst = BundleMember->Inst; + assert(SrcInst->mayReadOrWriteMemory() && + "NextLoadStore list for non memory effecting bundle?"); + MemoryLocation SrcLoc = getLocation(SrcInst); + bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; + + for ( ; DepDest; DepDest = DepDest->NextLoadStore) { + assert(isInSchedulingRegion(DepDest)); + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } - BundleMember = BundleMember->NextInBundle; } if (InsertInReadyList && SD->isReady()) { ReadyInsts.push_back(SD); @@ -7638,8 +7673,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Move the scheduled instruction(s) to their dedicated places, if not // there yet. 
- ScheduleData *BundleMember = picked; - while (BundleMember) { + for (ScheduleData *BundleMember = picked; BundleMember; + BundleMember = BundleMember->NextInBundle) { Instruction *pickedInst = BundleMember->Inst; if (pickedInst->getNextNode() != LastScheduledInst) { BS->BB->getInstList().remove(pickedInst); @@ -7647,7 +7682,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { pickedInst); } LastScheduledInst = pickedInst; - BundleMember = BundleMember->NextInBundle; } BS->schedule(picked, ReadyInsts); @@ -8045,8 +8079,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { + LLVM_DEBUG( + dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); return false; + } // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) @@ -8693,7 +8730,6 @@ class HorizontalReduction { static RecurKind getRdxKind(Instruction *I) { assert(I && "Expected instruction for reduction matching"); - TargetTransformInfo::ReductionFlags RdxFlags; if (match(I, m_Add(m_Value(), m_Value()))) return RecurKind::Add; if (match(I, m_Mul(m_Value(), m_Value()))) @@ -8767,7 +8803,6 @@ class HorizontalReduction { return RecurKind::None; } - TargetTransformInfo::ReductionFlags RdxFlags; switch (Pred) { default: return RecurKind::None; @@ -9206,7 +9241,7 @@ private: auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, - /*unsigned=*/false, CostKind); + /*IsUnsigned=*/false, CostKind); CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind); ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy, RdxPred, CostKind) + @@ -9571,8 +9606,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); - // Aggregate value is unlikely to be processed in vector register, we need to - // extract scalars into scalar registers, so NeedExtraction is set true. + // Aggregate value is unlikely to be processed in vector register. return tryToVectorizeList(BuildVectorOpds, R); } @@ -9598,7 +9632,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, function_ref<unsigned(T *)> Limit, function_ref<bool(T *, T *)> Comparator, function_ref<bool(T *, T *)> AreCompatible, - function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize, + function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, bool LimitForRegisterSize) { bool Changed = false; // Sort by type, parent, operands. @@ -9627,7 +9661,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // same/alternate ops only, this may result in some extra final // vectorization. if (NumElts > 1 && - TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { + TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { // Success start over because instructions might have been changed. Changed = true; } else if (NumElts < Limit(*IncIt) && @@ -9638,7 +9672,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, // Final attempt to vectorize instructions with the same types. 
if (Candidates.size() > 1 && (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { - if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) { + if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) { // Success start over because instructions might have been changed. Changed = true; } else if (LimitForRegisterSize) { @@ -9649,7 +9683,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) ++SameTypeIt; unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts), + if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts), /*LimitForRegisterSize=*/false)) Changed = true; It = SameTypeIt; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 65857f034210..e5dded3c0f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -59,7 +59,7 @@ class VPRecipeBuilder { /// Cross-iteration reduction & first-order recurrence phis for which we need /// to add the incoming value from the backedge after all recipes have been /// created. - SmallVector<VPWidenPHIRecipe *, 4> PhisToFix; + SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix; /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d9e71663cd2..a96c122db2a9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -677,10 +677,10 @@ void VPInstruction::generateInstruction(VPTransformState &State, // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. - Value *ScalarTC = State.TripCount; + Value *ScalarTC = State.get(getOperand(1), Part); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); @@ -711,6 +711,51 @@ void VPInstruction::generateInstruction(VPTransformState &State, } break; } + + case VPInstruction::CanonicalIVIncrement: + case VPInstruction::CanonicalIVIncrementNUW: { + Value *Next = nullptr; + if (Part == 0) { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = + createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCount: { + if (Part != 0) + break; + // First create the compare. + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *Cond = Builder.CreateICmpEQ(IV, TC); + + // Now create the branch. 
+ auto *Plan = getParent()->getPlan(); + VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath && + "empty entry block only expected in VPlanNativePath"); + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + } + // TODO: Once the exit block is modeled in VPlan, use it instead of going + // through State.CFG.LastBB. + BasicBlock *Exit = + cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0); + + Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -758,6 +803,15 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; + case VPInstruction::CanonicalIVIncrement: + O << "VF * UF + "; + break; + case VPInstruction::CanonicalIVIncrementNUW: + O << "VF * UF +(nuw) "; + break; + case VPInstruction::BranchOnCount: + O << "branch-on-count "; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -786,23 +840,55 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { FMF = FMFNew; } -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. -void VPlan::execute(VPTransformState *State) { - // -1. Check if the backedge taken count is needed, and if so build it. +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State) { + // Check if the trip count is needed, and if so build it. + if (TripCount && TripCount->getNumUsers()) { + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(TripCount, TripCountV, Part); + } + + // Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { - Value *TC = State->TripCount; - IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); - auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + auto *TCMO = Builder.CreateSub(TripCountV, + ConstantInt::get(TripCountV->getType(), 1), "trip.count.minus.1"); - auto VF = State->VF; + auto VF = State.VF; Value *VTCMO = VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); - for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) - State->set(BackedgeTakenCount, VTCMO, Part); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(BackedgeTakenCount, VTCMO, Part); } + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(&VectorTripCount, VectorTripCountV, Part); + + // When vectorizing the epilogue loop, the canonical induction start value + // needs to be changed from zero to the value after the main vector loop. 
+ if (CanonicalIVStartValue) { + VPValue *VPV = new VPValue(CanonicalIVStartValue); + addExternalDef(VPV); + auto *IV = getCanonicalIV(); + assert(all_of(IV->users(), + [](const VPUser *U) { + auto *VPI = cast<VPInstruction>(U); + return VPI->getOpcode() == + VPInstruction::CanonicalIVIncrement || + VPI->getOpcode() == + VPInstruction::CanonicalIVIncrementNUW; + }) && + "the canonical IV should only be used by its increments when " + "resetting the start value"); + IV->setOperand(0, VPV); + } +} + +/// Generate the code inside the body of the vectorized loop. Assumes a single +/// LoopVectorBody basic-block was created for this. Introduce additional +/// basic-blocks as needed, and fill them all. +void VPlan::execute(VPTransformState *State) { // 0. Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; @@ -834,28 +920,6 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Fix the latch value of reduction and first-order recurrences phis in the - // vector loop. - VPBasicBlock *Header = Entry->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R); - if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) || - isa<VPReductionPHIRecipe>(&R))) - continue; - // For first-order recurrences and in-order reduction phis, only a single - // part is generated, which provides the last part from the previous - // iteration. Otherwise all UF parts are generated. - bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) || - cast<VPReductionPHIRecipe>(&R)->isOrdered(); - unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *VecPhi = State->get(PhiR, Part); - Value *Val = State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? State->UF - 1 : Part); - cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB); - } - } - // Setup branch terminator successors for VPBBs in VPBBsToFix based on // VPBB's successors. for (auto VPBB : State->CFG.VPBBsToFix) { @@ -876,13 +940,19 @@ void VPlan::execute(VPTransformState *State) { // 3. Merge the temporary latch created with the last basic-block filled. BasicBlock *LastBB = State->CFG.PrevBB; + assert(isa<BranchInst>(LastBB->getTerminator()) && + "Expected VPlan CFG to terminate with branch"); + + // Move both the branch and check from LastBB to VectorLatchBB. + auto *LastBranch = cast<BranchInst>(LastBB->getTerminator()); + LastBranch->moveBefore(VectorLatchBB->getTerminator()); + VectorLatchBB->getTerminator()->eraseFromParent(); + // Move condition so it is guaranteed to be next to branch. This is only done + // to avoid excessive test updates. + // TODO: Remove special handling once the increments for all inductions are + // modeled explicitly in VPlan. + cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch); // Connect LastBB to VectorLatchBB to facilitate their merge. - assert((EnableVPlanNativePath || - isa<UnreachableInst>(LastBB->getTerminator())) && - "Expected InnerLoop VPlan CFG to terminate with unreachable"); - assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) && - "Expected VPlan CFG to terminate with branch in NativePath"); - LastBB->getTerminator()->eraseFromParent(); BranchInst::Create(VectorLatchBB, LastBB); // Merge LastBB with Latch. 
@@ -891,6 +961,37 @@ void VPlan::execute(VPTransformState *State) { assert(Merged && "Could not merge last basic block with latch."); VectorLatchBB = LastBB; + // Fix the latch value of canonical, reduction and first-order recurrences + // phis in the vector loop. + VPBasicBlock *Header = Entry->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath); + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + } + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedege values themselves. + // TODO: Model their backedge values explicitly. + if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R)) + continue; + + auto *PhiR = cast<VPHeaderPHIRecipe>(&R); + // For canonical IV, first-order recurrences and in-order reduction phis, + // only a single part is generated, which provides the last part from the + // previous iteration. For non-ordered reductions all UF parts are + // generated. + bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || + isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || + cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; + + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *Phi = State->get(PhiR, Part); + Value *Val = State->get(PhiR->getBackedgeValue(), + SinglePartNeeded ? State->UF - 1 : Part); + cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB); + } + } + // We do not attempt to preserve DT for outer loop vectorization currently. if (!EnableVPlanNativePath) updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, @@ -904,6 +1005,12 @@ void VPlan::print(raw_ostream &O) const { O << "VPlan '" << Name << "' {"; + if (VectorTripCount.getNumUsers() > 0) { + O << "\nLive-in "; + VectorTripCount.printAsOperand(O, SlotTracker); + O << " = vector-trip-count\n"; + } + if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { O << "\nLive-in "; BackedgeTakenCount->printAsOperand(O, SlotTracker); @@ -1155,7 +1262,15 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, } else O << " " << VPlanIngredient(IV); } +#endif +bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); + auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); + return StartC && StartC->isZero() && StepC && StepC->isOne(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -1255,7 +1370,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN "; if (!isStore()) { - getVPSingleValue()->printAsOperand(O, SlotTracker); + printAsOperand(O, SlotTracker); O << " = "; } O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; @@ -1264,26 +1379,39 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { + Value *Start = getStartValue()->getLiveInIRValue(); + PHINode *EntryPart = PHINode::Create( + Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); + EntryPart->setDebugLoc(DL); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, EntryPart, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void 
VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = CANONICAL-INDUCTION"; +} +#endif + void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.CanonicalIV; + Value *CanonicalIV = State.get(getOperand(0), 0); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; - assert(!VF.isScalable() && "the code following assumes non scalables ECs"); Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.getKnownMinValue(), - CanonicalIV, "broadcast"); + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - SmallVector<Constant *, 8> Indices; - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - Indices.push_back( - ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); - // If VF == 1, there is only one iteration in the loop above, thus the - // element pushed back into Indices is ConstantInt::get(STy, Part) - Constant *VStep = - VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); - // Add the consecutive indices to the vector value. + Value *VStep = createStepForVF(Builder, STy, VF, Part); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); + } Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(this, CanonicalVectorIV, Part); } @@ -1294,7 +1422,8 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); - O << " = WIDEN-CANONICAL-INDUCTION"; + O << " = WIDEN-CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); } #endif @@ -1461,7 +1590,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) { for (VPRecipeBase &VPI : *VPBB) { - if (isa<VPWidenPHIRecipe>(&VPI)) + if (isa<VPHeaderPHIRecipe>(&VPI)) continue; assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); auto *VPInst = cast<VPInstruction>(&VPI); @@ -1506,6 +1635,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { for (const VPValue *V : Plan.VPExternalDefs) assignSlot(V); + assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index f4a1883e35d5..824440f98a8b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -69,6 +69,9 @@ class VPlanSlp; /// vectors it is an expression determined at runtime. Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); +/// Return a value for Step multiplied by VF. +Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step); + /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. 
The range includes start and excludes end, e.g.,: /// [1, 9) = {1, 2, 4, 8} @@ -198,8 +201,8 @@ struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilder<> &Builder, InnerLoopVectorizer *ILV, VPlan *Plan) - : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV), - Plan(Plan) {} + : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) { + } /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -341,9 +344,6 @@ struct VPTransformState { /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). Value *CanonicalIV = nullptr; - /// Hold the trip count of the scalar loop. - Value *TripCount = nullptr; - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. InnerLoopVectorizer *ILV; @@ -793,6 +793,9 @@ public: SLPLoad, SLPStore, ActiveLaneMask, + CanonicalIVIncrement, + CanonicalIVIncrementNUW, + BranchOnCount, }; private: @@ -833,6 +836,16 @@ public: return R->getVPDefID() == VPRecipeBase::VPInstructionSC; } + /// Extra classof implementations to allow directly casting from VPUser -> + /// VPInstruction. + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && R->getVPDefID() == VPRecipeBase::VPInstructionSC; + } + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPInstructionSC; + } + unsigned getOpcode() const { return Opcode; } /// Generate the instruction. @@ -871,6 +884,7 @@ public: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: + case VPInstruction::BranchOnCount: return false; default: return true; @@ -1045,6 +1059,7 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } + const VPValue *getStartValue() const { return getOperand(0); } /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. @@ -1057,66 +1072,65 @@ public: /// Returns the induction descriptor for the recipe. const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } -}; -/// A recipe for handling first order recurrences and pointer inductions. For -/// first-order recurrences, the start value is the first operand of the recipe -/// and the incoming value from the backedge is the second operand. It also -/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all -/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly. -class VPWidenPHIRecipe : public VPRecipeBase, public VPValue { - /// List of incoming blocks. Only used in the VPlan native path. - SmallVector<VPBasicBlock *, 2> IncomingBlocks; + /// Returns true if the induction is canonical, i.e. starting at 0 and + /// incremented by UF * VF (= the original IV is incremented by 1). + bool isCanonical() const; + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + const TruncInst *TruncI = getTruncInst(); + return TruncI ? TruncI->getType() : IV->getType(); + } +}; +/// A pure virtual base class for all recipes modeling header phis, including +/// phis for first order recurrences, pointer inductions and reductions. The +/// start value is the first operand of the recipe and the incoming value from +/// the backedge is the second operand. 
+class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue { protected: - VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi, - VPValue *Start = nullptr) + VPHeaderPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi, + VPValue *Start = nullptr) : VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) { if (Start) addOperand(Start); } public: - /// Create a VPWidenPHIRecipe for \p Phi - VPWidenPHIRecipe(PHINode *Phi) - : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {} - - /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. - VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) { - addOperand(&Start); - } - - ~VPWidenPHIRecipe() override = default; + ~VPHeaderPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { - return B->getVPDefID() == VPRecipeBase::VPWidenPHISC || + return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || - B->getVPDefID() == VPRecipeBase::VPReductionPHISC; + B->getVPDefID() == VPRecipeBase::VPReductionPHISC || + B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || + B->getVPDefID() == VPRecipeBase::VPWidenPHISC; } static inline bool classof(const VPValue *V) { - return V->getVPValueID() == VPValue::VPVWidenPHISC || + return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || - V->getVPValueID() == VPValue::VPVReductionPHISC; + V->getVPValueID() == VPValue::VPVReductionPHISC || + V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || + V->getVPValueID() == VPValue::VPVWidenPHISC; } - /// Generate the phi/select nodes. - void execute(VPTransformState &State) override; + /// Generate the phi nodes. + void execute(VPTransformState &State) override = 0; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + VPSlotTracker &SlotTracker) const override = 0; #endif - /// Returns the start value of the phi, if it is a reduction or first-order - /// recurrence. + /// Returns the start value of the phi, if one is set. VPValue *getStartValue() { return getNumOperands() == 0 ? nullptr : getOperand(0); } - /// Returns the incoming value from the loop backedge, if it is a reduction or - /// first-order recurrence. + /// Returns the incoming value from the loop backedge. VPValue *getBackedgeValue() { return getOperand(1); } @@ -1126,6 +1140,44 @@ public: VPRecipeBase *getBackedgeRecipe() { return cast<VPRecipeBase>(getBackedgeValue()->getDef()); } +}; + +/// A recipe for handling header phis that are widened in the vector loop. +/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are +/// managed in the recipe directly. +class VPWidenPHIRecipe : public VPHeaderPHIRecipe { + /// List of incoming blocks. Only used in the VPlan native path. + SmallVector<VPBasicBlock *, 2> IncomingBlocks; + +public: + /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. + VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr) + : VPHeaderPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) { + if (Start) + addOperand(Start); + } + + ~VPWidenPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. 
+ static inline bool classof(const VPRecipeBase *B) { + return B->getVPDefID() == VPRecipeBase::VPWidenPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenPHISC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { @@ -1133,27 +1185,27 @@ public: IncomingBlocks.push_back(IncomingBlock); } - /// Returns the \p I th incoming VPValue. - VPValue *getIncomingValue(unsigned I) { return getOperand(I); } - /// Returns the \p I th incoming VPBasicBlock. VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } + + /// Returns the \p I th incoming VPValue. + VPValue *getIncomingValue(unsigned I) { return getOperand(I); } }; /// A recipe for handling first-order recurrence phis. The start value is the /// first operand of the recipe and the incoming value from the backedge is the /// second operand. -struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { +struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start) - : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC, - VPFirstOrderRecurrencePHISC, Phi, &Start) {} + : VPHeaderPHIRecipe(VPVFirstOrderRecurrencePHISC, + VPFirstOrderRecurrencePHISC, Phi, &Start) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; } - static inline bool classof(const VPWidenPHIRecipe *D) { - return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC; @@ -1171,7 +1223,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// A recipe for handling reduction phis. The start value is the first operand /// of the recipe and the incoming value from the backedge is the second /// operand. -class VPReductionPHIRecipe : public VPWidenPHIRecipe { +class VPReductionPHIRecipe : public VPHeaderPHIRecipe { /// Descriptor for the reduction. 
const RecurrenceDescriptor &RdxDesc; @@ -1187,7 +1239,7 @@ public: VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) - : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), + : VPHeaderPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } @@ -1198,12 +1250,12 @@ public: static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; } + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; + } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVReductionPHISC; } - static inline bool classof(const VPWidenPHIRecipe *R) { - return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; - } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; @@ -1601,11 +1653,46 @@ public: #endif }; +/// Canonical scalar induction phi of the vector loop. Starting at the specified +/// start value (either 0 or the resume value when vectorizing the epilogue +/// loop). VPWidenCanonicalIVRecipe represents the vector version of the +/// canonical induction variable. +class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVCanonicalIVPHISC, VPCanonicalIVPHISC, + nullptr, StartV), + DL(DL) {} + + ~VPCanonicalIVPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPCanonicalIVPHISC; + } + + /// Generate the canonical scalar induction phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + return getOperand(0)->getLiveInIRValue()->getType(); + } +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: - VPWidenCanonicalIVRecipe() - : VPRecipeBase(VPWidenCanonicalIVSC, {}), + VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV) + : VPRecipeBase(VPWidenCanonicalIVSC, {CanonicalIV}), VPValue(VPValue::VPVWidenCanonicalIVSC, nullptr, this) {} ~VPWidenCanonicalIVRecipe() override = default; @@ -1615,6 +1702,16 @@ public: return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; } + /// Extra classof implementations to allow directly casting from VPUser -> + /// VPWidenCanonicalIVRecipe. + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; + } + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC; + } + /// Generate a canonical vector induction variable of the vector loop, with /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and /// step = <VF*UF, VF*UF, ..., VF*UF>. 
@@ -1625,6 +1722,12 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + + /// Returns the scalar type of the induction. + const Type *getScalarType() const { + return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDef()) + ->getScalarType(); + } }; /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It @@ -2112,10 +2215,17 @@ class VPlan { // (operators '==' and '<'). SetVector<VPValue *> VPExternalDefs; - /// Represents the backedge taken count of the original loop, for folding + /// Represents the trip count of the original loop, for folding /// the tail. + VPValue *TripCount = nullptr; + + /// Represents the backedge taken count of the original loop, for folding + /// the tail. It equals TripCount - 1. VPValue *BackedgeTakenCount = nullptr; + /// Represents the vector trip count. + VPValue VectorTripCount; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2147,12 +2257,18 @@ public: } for (VPValue *VPV : VPValuesToFree) delete VPV; + if (TripCount) + delete TripCount; if (BackedgeTakenCount) delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) delete Def; } + /// Prepare the plan for execution, setting up the required live-in values. + void prepareToExecute(Value *TripCount, Value *VectorTripCount, + Value *CanonicalIVStartValue, VPTransformState &State); + /// Generate the IR code for this VPlan. void execute(struct VPTransformState *State); @@ -2165,6 +2281,13 @@ public: return Entry; } + /// The trip count of the original loop. + VPValue *getOrCreateTripCount() { + if (!TripCount) + TripCount = new VPValue(); + return TripCount; + } + /// The backedge taken count of the original loop. VPValue *getOrCreateBackedgeTakenCount() { if (!BackedgeTakenCount) @@ -2172,6 +2295,9 @@ public: return BackedgeTakenCount; } + /// The vector trip count. + VPValue &getVectorTripCount() { return VectorTripCount; } + /// Mark the plan to indicate that using Value2VPValue is not safe any /// longer, because it may be stale. void disableValue2VPValue() { Value2VPValueEnabled = false; } @@ -2264,6 +2390,21 @@ public: return !VPV->getDef() || (RepR && RepR->isUniform()); } + /// Returns the VPRegionBlock of the vector loop. + VPRegionBlock *getVectorLoopRegion() { + return cast<VPRegionBlock>(getEntry()); + } + + /// Returns the canonical induction recipe of the vector loop. + VPCanonicalIVPHIRecipe *getCanonicalIV() { + VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock(); + if (EntryVPBB->empty()) { + // VPlan native path. + EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor()); + } + return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 86ecd6817873..e879a33db6ee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -231,7 +231,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { } // Entry point. The driver function for the predicator. -void VPlanPredicator::predicate(void) { +void VPlanPredicator::predicate() { // Predicate the blocks within Region. 
predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry())); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h index 692afd2978d5..a5db9a54da3c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h @@ -68,7 +68,7 @@ public: VPlanPredicator(VPlan &Plan); /// Predicate Plan's HCFG. - void predicate(void); + void predicate(); }; } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d2daf558c2c5..fb5f3d428189 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -324,3 +324,30 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { E.first->eraseFromParent(); } } + +void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) { + VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; + for (VPUser *U : CanonicalIV->users()) { + WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U); + if (WidenNewIV) + break; + } + + if (!WidenNewIV) + return; + + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + + // If the induction recipe is canonical and the types match, use it + // directly. + if (WidenOriginalIV && WidenOriginalIV->isCanonical() && + WidenOriginalIV->getScalarType() == WidenNewIV->getScalarType()) { + WidenNewIV->replaceAllUsesWith(WidenOriginalIV); + WidenNewIV->eraseFromParent(); + return; + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a82a562d5e35..e74409a86466 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -45,6 +45,10 @@ struct VPlanTransforms { /// in the vectorized loop. There is no need to vectorize the cast - the same /// value can be used for both the phi and casts in the vector loop. static void removeRedundantInductionCasts(VPlan &Plan); + + /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV + /// recipe, if it exists. + static void removeRedundantCanonicalIVs(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index fd92201614df..5296d2b9485c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -96,14 +96,15 @@ public: VPVReplicateSC, VPVWidenSC, VPVWidenCallSC, + VPVWidenCanonicalIVSC, VPVWidenGEPSC, VPVWidenSelectSC, // Phi-like VPValues. Need to be kept together. VPVBlendSC, + VPVCanonicalIVPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, - VPVWidenCanonicalIVSC, VPVWidenIntOrFpInductionSC, VPVPredInstPHI, VPVReductionPHISC, @@ -177,6 +178,7 @@ public: void replaceAllUsesWith(VPValue *New); VPDef *getDef() { return Def; } + const VPDef *getDef() const { return Def; } /// Returns the underlying IR value, if this VPValue is defined outside the /// scope of VPlan. 
Returns nullptr if the VPValue is defined by a VPDef @@ -186,6 +188,11 @@ public: "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); return getUnderlyingValue(); } + const Value *getLiveInIRValue() const { + assert(!getDef() && + "VPValue is not a live-in; it is defined by a VPDef inside a VPlan"); + return getUnderlyingValue(); + } }; typedef DenseMap<Value *, VPValue *> Value2VPValueTy; @@ -325,6 +332,7 @@ public: VPReductionSC, VPReplicateSC, VPWidenCallSC, + VPWidenCanonicalIVSC, VPWidenGEPSC, VPWidenMemoryInstructionSC, VPWidenSC, @@ -332,9 +340,9 @@ public: // Phi-like recipes. Need to be kept together. VPBlendSC, + VPCanonicalIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, - VPWidenCanonicalIVSC, VPWidenIntOrFpInductionSC, VPPredInstPHISC, VPReductionPHISC, @@ -403,7 +411,6 @@ public: class VPlan; class VPBasicBlock; -class VPRegionBlock; /// This class can be used to assign consecutive numbers to all VPValues in a /// VPlan and allows querying the numbering for printing, similar to the diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7732d9367985..d36f250995e1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -163,12 +163,32 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { errs() << "VPlan entry block is not a VPBasicBlock\n"; return false; } + + if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) { + errs() << "VPlan vector loop header does not start with a " + "VPCanonicalIVPHIRecipe\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); if (!Exit) { errs() << "VPlan exit block is not a VPBasicBlock\n"; return false; } + if (Exit->empty()) { + errs() << "VPlan vector loop exit must end with BranchOnCount " + "VPInstruction but is empty\n"; + return false; + } + + auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end())); + if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { + errs() << "VPlan vector loop exit must end with BranchOnCount " + "VPInstruction\n"; + return false; + } + for (const VPRegionBlock *Region : VPBlockUtils::blocksOnly<const VPRegionBlock>( depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>( diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index c0aedab2fed0..620d388199e0 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -881,7 +881,8 @@ static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy, ConstantRange IdxRange(IntWidth, true); if (isGuaranteedNotToBePoison(Idx, &AC)) { - if (ValidIndices.contains(computeConstantRange(Idx, true, &AC, CtxI, &DT))) + if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, + true, &AC, CtxI, &DT))) return ScalarizationResult::safe(); return ScalarizationResult::unsafe(); } diff --git a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp index 1be1d34417eb..40c03f7b0de7 100644 --- a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp +++ b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp @@ -669,7 +669,7 @@ WindowsManifestMerger::WindowsManifestMergerImpl::getMergedManifest() { std::unique_ptr<xmlDoc, XmlDeleter> OutputDoc( xmlNewDoc((const unsigned char *)"1.0")); xmlDocSetRootElement(OutputDoc.get(), CombinedRoot); - assert(0 == 
xmlDocGetRootElement(CombinedDoc)); + assert(nullptr == xmlDocGetRootElement(CombinedDoc)); xmlKeepBlanksDefault(0); xmlChar *Buff = nullptr;
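
A note for readers following the BoUpSLP::BlockScheduling::calculateDependencies changes above: the restructured loop keeps the same two complexity limits, and the early abort at twice MaxMemDepDistance is justified by the transitivity argument in the i0..i8 comment. Below is a minimal standalone sketch of that control flow; MemInst, isAliased and the alias groups are invented stand-ins for LLVM's ScheduleData and SLP->isAliased, not the real classes.

#include <cstddef>
#include <cstdio>
#include <vector>

// Bounded memory-dependence scan, modeled after calculateDependencies:
// alias checks are rationed by AliasedCheckLimit, dependencies become
// conservative once DistToSrc reaches MaxMemDepDistance, and the scan
// aborts at 2 * MaxMemDepDistance because the remaining dependencies
// follow transitively from the ones already added.
struct MemInst {
  bool MayWrite;
  int AliasGroup; // same group => "aliased" in this toy model
};

static bool isAliased(const MemInst &A, const MemInst &B) {
  return A.AliasGroup == B.AliasGroup; // stand-in for SLP->isAliased
}

int main() {
  const unsigned MaxMemDepDistance = 3, AliasedCheckLimit = 2;
  std::vector<MemInst> Chain = {{true, 0}, {false, 1}, {true, 0}, {false, 2},
                                {true, 1}, {false, 0}, {true, 2}, {false, 1}};
  const MemInst &Src = Chain[0]; // the bundle member being scanned from
  unsigned NumAliased = 0, DistToSrc = 1;
  for (std::size_t I = 1; I < Chain.size(); ++I) {
    const MemInst &Dep = Chain[I];
    if (DistToSrc >= MaxMemDepDistance ||
        ((Src.MayWrite || Dep.MayWrite) &&
         (NumAliased >= AliasedCheckLimit || isAliased(Src, Dep)))) {
      ++NumAliased; // counted only when a dependency is actually recorded
      std::printf("dep: 0 -> %zu (dist %u)\n", I, DistToSrc);
    }
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break; // later deps are transitive through the ones just added
    ++DistToSrc;
  }
  return 0;
}

With MaxMemDepDistance = 3 this records dependencies conservatively from distance 3 onward and stops scanning at distance 6, which is exactly the quadratic-blowup guard the in-source comment describes.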
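
In VPInstruction::generateInstruction, ActiveLaneMask now takes the scalar trip count as an explicit second operand (State.get(getOperand(1), Part)) rather than reading the removed State.TripCount, and builds the predicate type from the possibly-scalable VF. The LangRef defines llvm.get.active.lane.mask(base, n) so that lane i is the unsigned comparison base + i < n; here is a plain C++ sketch of that lane-wise semantics (names illustrative, fixed VF assumed):

#include <cstdint>
#include <cstdio>
#include <vector>

// Lane-wise model of llvm.get.active.lane.mask(Base, N) for a fixed VF:
// Mask[i] = (Base + i < N), using unsigned arithmetic.
static std::vector<bool> activeLaneMask(uint64_t Base, uint64_t N,
                                        unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = Base + I < N;
  return Mask;
}

int main() {
  // Trip count 10, VF 4: the last vector iteration (Base = 8) runs with
  // lanes 2 and 3 masked off, so tail folding needs no scalar epilogue.
  for (uint64_t Base = 0; Base < 12; Base += 4) {
    std::printf("base %2llu:", (unsigned long long)Base);
    for (bool B : activeLaneMask(Base, 10, 4))
      std::printf(" %d", (int)B);
    std::printf("\n");
  }
  return 0;
}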
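
The new canonical-IV machinery splits the vector loop's induction across several recipes: VPCanonicalIVPHIRecipe emits the scalar index phi (starting at 0, or at the resume value when vectorizing the epilogue loop), CanonicalIVIncrement advances it by VF * UF per iteration, VPWidenCanonicalIVRecipe materializes the per-part lane values IV + Part * VF + Lane, and BranchOnCount compares the incremented index against the vector trip count. A small model of one such loop, assuming fixed VF = 4 and UF = 2 (illustrative C++, not LLVM API):

#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const unsigned VectorTC = 16; // vector trip count, a multiple of VF * UF
  unsigned IV = 0;              // VPCanonicalIVPHIRecipe: index phi
  do {
    for (unsigned Part = 0; Part < UF; ++Part) {
      std::printf("IV %2u part %u lanes:", IV, Part);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        std::printf(" %2u", IV + Part * VF + Lane); // VPWidenCanonicalIVRecipe
      std::printf("\n");
    }
    IV += VF * UF; // CanonicalIVIncrement: index.next = index + VF * UF
  } while (IV != VectorTC); // BranchOnCount: icmp eq, then br exit/header
  return 0;
}

Because the vector trip count is a multiple of VF * UF, the equality compare emitted by BranchOnCount is exact: the index lands on VectorTC without overshooting. The isCanonical() check added in VPlan.cpp (start 0, step 1) is what lets removeRedundantCanonicalIVs substitute an existing widened induction for the widened canonical IV when the scalar types match.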