author    | Dimitry Andric <dim@FreeBSD.org> | 2020-01-17 20:45:01 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-01-17 20:45:01 +0000
commit    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree      | 4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Transforms/Vectorize
parent    | 7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
download  | src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.tar.gz, src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.zip
Vendor import of llvm-project master e26a78e70, the last commit before
the llvmorg-11-init tag, from which release/10.x was branched.
Notes:
svn path=/vendor/llvm-project/master/; revision=356843
svn path=/vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085/; revision=356844; tag=vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp       |    5
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp |   14
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h    |   11
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 1011
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             |  965
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h             |   44
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp                     |   61
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h                       |  365
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (renamed from llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp) | 14
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.h (renamed from llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h)     | 12
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h                  |    2
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp             |    1
12 files changed, 1548 insertions, 957 deletions
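
One recurring pattern in the LoopVectorize.cpp changes below is the promotion of the
hard-coded TinyTripCountInterleaveThreshold constant to a cl::opt command-line knob.
As a standalone sketch of that LLVM cl::opt pattern (the option name and default match
the diff; the main() driver is only an illustrative harness, not part of the change):

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Formerly: static const unsigned TinyTripCountInterleaveThreshold = 128;
// As a cl::opt the default stays 128, but the value becomes tunable via
// -tiny-trip-count-interleave-threshold=<N>; cl::Hidden keeps the option
// out of the standard -help listing.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip "
             "count below this number"));

// Illustrative driver only.
int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // cl::opt<unsigned> converts implicitly to its underlying unsigned value.
  outs() << "interleave threshold = " << TinyTripCountInterleaveThreshold << "\n";
  return 0;
}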
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index f44976c723ec..7478daa2a0a5 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -38,6 +38,7 @@
 // could use this pass (with some modifications), but currently it implements
 // its own pass to do something similar to what we do here.
 
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/MapVector.h"
@@ -52,7 +53,6 @@
 #include "llvm/Analysis/OrderedBasicBlock.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Attributes.h"
@@ -71,14 +71,15 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f43842be5357..3f943f4c0688 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -72,7 +72,7 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
       Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
       Force("vectorize.enable", FK_Undefined, HK_FORCE),
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
-      Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L),
+      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L),
       ORE(ORE) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
@@ -815,6 +815,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
     }
   }
 
+  // For first order recurrences, we use the previous value (incoming value from
+  // the latch) to check if it dominates all users of the recurrence. Bail out
+  // if we have to sink such an instruction for another recurrence, as the
+  // dominance requirement may not hold after sinking.
+  BasicBlock *LoopLatch = TheLoop->getLoopLatch();
+  if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
+        Instruction *V =
+            cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
+        return SinkAfter.find(V) != SinkAfter.end();
+      }))
+    return false;
+
   // Now we know the widest induction type, check if our found induction
   // is the same size. If it's not, unset it here and InnerLoopVectorizer
   // will create another.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a5e85f27fabf..c3ca43fcd492 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -201,6 +201,9 @@ class LoopVectorizationPlanner {
   /// The profitability analysis.
   LoopVectorizationCostModel &CM;
 
+  /// The interleaved access analysis.
+  InterleavedAccessInfo &IAI;
+
   SmallVector<VPlanPtr, 4> VPlans;
 
   /// This class is used to enable the VPlan to invoke a method of ILV. This is
@@ -211,6 +214,8 @@ class LoopVectorizationPlanner {
     VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
 
     Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+    Value *getOrCreateScalarValue(Value *V,
+                                  const VPIteration &Instance) override;
   };
 
   /// A builder used to construct the current plan.
@@ -223,8 +228,10 @@ public:
   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo *TTI,
                            LoopVectorizationLegality *Legal,
-                           LoopVectorizationCostModel &CM)
-      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+                           LoopVectorizationCostModel &CM,
+                           InterleavedAccessInfo &IAI)
+      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+        IAI(IAI) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8f0bf70f873c..684a3098e564 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,8 +58,8 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanHCFGBuilder.h"
-#include "VPlanHCFGTransforms.h"
 #include "VPlanPredicator.h"
+#include "VPlanTransforms.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -124,6 +124,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -149,7 +150,6 @@
 #include <string>
 #include <tuple>
 #include <utility>
-#include <vector>
 
 using namespace llvm;
 
@@ -200,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
 
-/// We don't interleave loops with a known constant trip count below this
-/// number.
-static const unsigned TinyTripCountInterleaveThreshold = 128;
+static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
+    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
+    cl::desc("We don't interleave loops with a estimated constant trip count "
+             "below this number"));
 
 static cl::opt<unsigned> ForceTargetNumScalarRegs(
     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
@@ -427,6 +428,11 @@ public:
   /// new unrolled loop, where UF is the unroll factor.
   using VectorParts = SmallVector<Value *, 2>;
 
+  /// Vectorize a single GetElementPtrInst based on information gathered and
+  /// decisions taken during planning.
+  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
+                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -476,15 +482,20 @@ public:
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to,
-  /// optionally masking the vector operations if \p BlockInMask is non-null.
-  void vectorizeInterleaveGroup(Instruction *Instr,
-                                VectorParts *BlockInMask = nullptr);
-
-  /// Vectorize Load and Store instructions, optionally masking the vector
-  /// operations if \p BlockInMask is non-null.
-  void vectorizeMemoryInstruction(Instruction *Instr,
-                                  VectorParts *BlockInMask = nullptr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to
+  /// with the base address given in \p Addr, optionally masking the vector
+  /// operations if \p BlockInMask is non-null. Use \p State to translate given
+  /// VPValues to IR values in the vectorized loop.
+  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
+                                VPValue *Addr, VPValue *BlockInMask = nullptr);
+
+  /// Vectorize Load and Store instructions with the base address given in \p
+  /// Addr, optionally masking the vector operations if \p BlockInMask is
+  /// non-null. Use \p State to translate given VPValues to IR values in the
+  /// vectorized loop.
+  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
+                                  VPValue *Addr,
+                                  VPValue *BlockInMask = nullptr);
 
   /// Set the debug location in the builder using the debug location in
   /// the instruction.
@@ -525,6 +536,9 @@ protected:
   /// vectorizing this phi node.
   void fixReduction(PHINode *Phi);
 
+  /// Clear NSW/NUW flags from reduction instructions if necessary.
+  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+
   /// The Loop exit block may have single value PHI nodes with some
   /// incoming value. While vectorizing we only handled real values
   /// that were defined inside the loop and we should have one value for
@@ -539,10 +553,6 @@ protected:
   /// represented as.
   void truncateToMinimalBitwidths();
 
-  /// Insert the new loop to the loop hierarchy and pass manager
-  /// and update the analysis passes.
-  void updateAnalysis();
-
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
   /// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -1204,14 +1214,14 @@ public:
   /// Returns true if the target machine supports masked scatter operation
   /// for the given \p DataType.
-  bool isLegalMaskedScatter(Type *DataType) {
-    return TTI.isLegalMaskedScatter(DataType);
+  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+    return TTI.isLegalMaskedScatter(DataType, Alignment);
   }
 
   /// Returns true if the target machine supports masked gather operation
   /// for the given \p DataType.
-  bool isLegalMaskedGather(Type *DataType) {
-    return TTI.isLegalMaskedGather(DataType);
+  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
+    return TTI.isLegalMaskedGather(DataType, Alignment);
   }
 
   /// Returns true if the target machine can represent \p V as a masked gather
@@ -1222,7 +1232,9 @@ public:
     if (!LI && !SI)
       return false;
     auto *Ty = getMemInstValueType(V);
-    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+    MaybeAlign Align = getLoadStoreAlignment(V);
+    return (LI && isLegalMaskedGather(Ty, Align)) ||
+           (SI && isLegalMaskedScatter(Ty, Align));
   }
 
   /// Returns true if \p I is an instruction that will be scalarized with
@@ -2155,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
-                                                   VectorParts *BlockInMask) {
+                                                   VPTransformState &State,
+                                                   VPValue *Addr,
+                                                   VPValue *BlockInMask) {
   const InterleaveGroup<Instruction> *Group =
       Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
@@ -2165,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     return;
 
   const DataLayout &DL = Instr->getModule()->getDataLayout();
-  Value *Ptr = getLoadStorePointerOperand(Instr);
 
   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getMemInstValueType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
-  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
 
   // Prepare for the new pointers.
-  setDebugLocFromInst(Builder, Ptr);
-  SmallVector<Value *, 2> NewPtrs;
+  SmallVector<Value *, 2> AddrParts;
   unsigned Index = Group->getIndex(Instr);
 
-  VectorParts Mask;
-  bool IsMaskForCondRequired = BlockInMask;
-  if (IsMaskForCondRequired) {
-    Mask = *BlockInMask;
-    // TODO: extend the masked interleaved-group support to reversed access.
-    assert(!Group->isReverse() && "Reversed masked interleave-group "
-                                  "not supported.");
-  }
+  // TODO: extend the masked interleaved-group support to reversed access.
+  assert((!BlockInMask || !Group->isReverse()) &&
+         "Reversed masked interleave-group not supported.");
 
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
@@ -2196,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   if (Group->isReverse())
     Index += (VF - 1) * Group->getFactor();
 
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-    InBounds = gep->isInBounds();
-
   for (unsigned Part = 0; Part < UF; Part++) {
-    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
+    Value *AddrPart = State.get(Addr, {Part, 0});
+    setDebugLocFromInst(Builder, AddrPart);
 
     // Notice current instruction could be any index. Need to adjust the address
     // to the member of index 0.
@@ -2214,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     //       A[i]   = b;  // Member of index 0
     //       A[i+2] = c;  // Member of index 2 (Current instruction)
     // Current pointer is pointed to A[i+2], adjust it to A[i].
-    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
-    if (InBounds)
-      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
+
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
+    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
 
     // Cast to the vector pointer type.
-    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
+    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
+    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
+    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
   }
 
   setDebugLocFromInst(Builder, Instr);
@@ -2237,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
       Instruction *NewLoad;
-      if (IsMaskForCondRequired || MaskForGaps) {
+      if (BlockInMask || MaskForGaps) {
         assert(useMaskedInterleavedAccesses(*TTI) &&
                "masked interleaved groups are not allowed.");
         Value *GroupMask = MaskForGaps;
-        if (IsMaskForCondRequired) {
-          auto *Undefs = UndefValue::get(Mask[Part]->getType());
+        if (BlockInMask) {
+          Value *BlockInMaskPart = State.get(BlockInMask, Part);
+          auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
           Value *ShuffledMask = Builder.CreateShuffleVector(
-              Mask[Part], Undefs, RepMask, "interleaved.mask");
+              BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
           GroupMask = MaskForGaps
                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                 MaskForGaps)
                           : ShuffledMask;
         }
         NewLoad =
-            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
                                      GroupMask, UndefVec, "wide.masked.vec");
       }
       else
-        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
+        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                             Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
@@ -2325,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                       "interleaved.vec");
 
     Instruction *NewStoreInstr;
-    if (IsMaskForCondRequired) {
-      auto *Undefs = UndefValue::get(Mask[Part]->getType());
+    if (BlockInMask) {
+      Value *BlockInMaskPart = State.get(BlockInMask, Part);
+      auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
       Value *ShuffledMask = Builder.CreateShuffleVector(
-          Mask[Part], Undefs, RepMask, "interleaved.mask");
+          BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
       NewStoreInstr = Builder.CreateMaskedStore(
-          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+          IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
     }
     else
-      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
-                                                 Group->getAlignment());
+      NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
+                                                 Group->getAlignment());
 
     Group->addMetadata(NewStoreInstr);
   }
 }
 
 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
-                                                     VectorParts *BlockInMask) {
+                                                     VPTransformState &State,
+                                                     VPValue *Addr,
+                                                     VPValue *BlockInMask) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   StoreInst *SI = dyn_cast<StoreInst>(Instr);
 
@@ -2354,17 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
          "CM decision should be taken at this point");
   if (Decision == LoopVectorizationCostModel::CM_Interleave)
-    return vectorizeInterleaveGroup(Instr);
+    return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
-  Value *Ptr = getLoadStorePointerOperand(Instr);
   // An alignment of 0 means target abi alignment. We need to use the scalar's
   // target abi alignment in such a case.
   const DataLayout &DL = Instr->getModule()->getDataLayout();
   const Align Alignment =
       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
-  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -2378,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   // gather/scatter. Otherwise Decision should have been to Scalarize.
   assert((ConsecutiveStride || CreateGatherScatter) &&
          "The instruction should be scalarized");
+  (void)ConsecutiveStride;
 
-  // Handle consecutive loads/stores.
-  if (ConsecutiveStride)
-    Ptr = getOrCreateScalarValue(Ptr, {0, 0});
-
-  VectorParts Mask;
+  VectorParts BlockInMaskParts(UF);
   bool isMaskRequired = BlockInMask;
   if (isMaskRequired)
-    Mask = *BlockInMask;
-
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(
-          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
-    InBounds = gep->isInBounds();
+    for (unsigned Part = 0; Part < UF; ++Part)
+      BlockInMaskParts[Part] = State.get(BlockInMask, Part);
 
   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
     // Calculate the pointer for the specific unroll-part.
     GetElementPtrInst *PartPtr = nullptr;
 
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+
     if (Reverse) {
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
@@ -2407,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
       PartPtr->setIsInBounds(InBounds);
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
-        Mask[Part] = reverseVector(Mask[Part]);
+        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
       PartPtr = cast<GetElementPtrInst>(
           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
       PartPtr->setIsInBounds(InBounds);
     }
 
+    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
   };
 
@@ -2425,8 +2433,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
       Instruction *NewSI = nullptr;
       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
       if (CreateGatherScatter) {
-        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
-        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        Value *VectorGep = State.get(Addr, Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
                                             Alignment.value(), MaskPart);
       } else {
@@ -2437,10 +2445,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
           // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
         }
-        auto *VecPtr = CreateVecPtr(Part, Ptr);
+        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
         if (isMaskRequired)
-          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
-                                            Alignment.value(), Mask[Part]);
+          NewSI = Builder.CreateMaskedStore(
+              StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
         else
           NewSI =
               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
@@ -2456,17 +2464,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *NewLI;
     if (CreateGatherScatter) {
-      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
-      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      Value *VectorGep = State.get(Addr, Part);
       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
                                          nullptr, "wide.masked.gather");
       addMetadata(NewLI, LI);
     } else {
-      auto *VecPtr = CreateVecPtr(Part, Ptr);
+      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
       if (isMaskRequired)
-        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
-                                         UndefValue::get(DataTy),
-                                         "wide.masked.load");
+        NewLI = Builder.CreateMaskedLoad(
+            VecPtr, Alignment.value(), BlockInMaskParts[Part],
+            UndefValue::get(DataTy), "wide.masked.load");
       else
         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
                                           "wide.load");
@@ -2676,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                          BasicBlock *Bypass) {
   Value *Count = getOrCreateTripCount(L);
-  BasicBlock *BB = L->getLoopPreheader();
-  IRBuilder<> Builder(BB->getTerminator());
+  // Reuse existing vector loop preheader for TC checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  IRBuilder<> Builder(TCCheckBlock->getTerminator());
 
   // Generate code to check if the loop's trip count is less than VF * UF, or
   // equal to it in case a scalar epilogue is required; this implies that the
@@ -2694,48 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
       P, Count, ConstantInt::get(Count->getType(), VF * UF),
       "min.iters.check");
 
-  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
-  LoopBypassBlocks.push_back(BB);
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader =
+      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
+                 "vector.ph");
+
+  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+                               DT->getNode(Bypass)->getIDom()) &&
+         "TC check is expected to dominate Bypass");
+
+  // Update dominator for Bypass & LoopExit.
+  DT->changeImmediateDominator(Bypass, TCCheckBlock);
+  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+  ReplaceInstWithInst(
+      TCCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+  LoopBypassBlocks.push_back(TCCheckBlock);
 }
 
 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-  BasicBlock *BB = L->getLoopPreheader();
+  // Reuse existing vector loop preheader for SCEV checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
 
   // Generate the code to check that the SCEV assumptions that we made.
   // We want the new basic block to start at the first instruction in a
   // sequence of instructions that form a check.
   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                    "scev.check");
-  Value *SCEVCheck =
-      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+  Value *SCEVCheck = Exp.expandCodeForPredicate(
+      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
 
   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
     if (C->isZero())
       return;
 
-  assert(!BB->getParent()->hasOptSize() &&
+  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
          "Cannot SCEV check stride or overflow when optimizing for size");
 
-  // Create a new block containing the stride check.
-  BB->setName("vector.scevcheck");
-  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
-  LoopBypassBlocks.push_back(BB);
+  SCEVCheckBlock->setName("vector.scevcheck");
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader =
+      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
+                 nullptr, "vector.ph");
+
+  // Update dominator only if this is first RT check.
+  if (LoopBypassBlocks.empty()) {
+    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
+    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
+  }
+
+  ReplaceInstWithInst(
+      SCEVCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+  LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 }
 
@@ -2744,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   if (EnableVPlanNativePath)
     return;
 
-  BasicBlock *BB = L->getLoopPreheader();
+  // Reuse existing vector loop preheader for runtime memory checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const MemCheckBlock = L->getLoopPreheader();
 
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
   Instruction *FirstCheckInst;
   Instruction *MemRuntimeCheck;
   std::tie(FirstCheckInst, MemRuntimeCheck) =
-      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+      Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
   if (!MemRuntimeCheck)
     return;
 
-  if (BB->getParent()->hasOptSize()) {
+  if (MemCheckBlock->getParent()->hasOptSize()) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
            "Cannot emit memory checks when optimizing for size, unless forced "
            "to vectorize.");
@@ -2770,24 +2795,28 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
     });
   }
 
-  // Create a new block containing the memory check.
-  BB->setName("vector.memcheck");
-  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
-  LoopBypassBlocks.push_back(BB);
+  MemCheckBlock->setName("vector.memcheck");
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader =
+      SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
+                 "vector.ph");
+
+  // Update dominator only if this is first RT check.
+  if (LoopBypassBlocks.empty()) {
+    DT->changeImmediateDominator(Bypass, MemCheckBlock);
+    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
+  }
+
+  ReplaceInstWithInst(
+      MemCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
+  LoopBypassBlocks.push_back(MemCheckBlock);
   AddedSafetyChecks = true;
 
   // We currently don't use LoopVersioning for the actual loop cloning but we
   // still use it to add the noalias metadata.
   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
-                                          PSE.getSE());
+                                          PSE.getSE());
   LVer->prepareNoAliasMetadata();
 }
 
@@ -2912,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   ...
   */
 
-  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
-  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   MDNode *OrigLoopID = OrigLoop->getLoopID();
-  assert(VectorPH && "Invalid loop structure");
-  assert(ExitBlock && "Must have an exit block");
 
   // Some loops have a single integer induction variable, while other loops
   // don't. One example is c++ iterators that often have multiple pointer
@@ -2934,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   Type *IdxTy = Legal->getWidestInductionType();
 
   // Split the single block loop into the two loop structure described above.
-  BasicBlock *VecBody =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
-  BasicBlock *MiddleBlock =
-      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
-  BasicBlock *ScalarPH =
-      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+  LoopScalarBody = OrigLoop->getHeader();
+  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+  LoopExitBlock = OrigLoop->getExitBlock();
+  assert(LoopExitBlock && "Must have an exit block");
+  assert(LoopVectorPreHeader && "Invalid loop structure");
+
+  LoopMiddleBlock =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 LI, nullptr, "middle.block");
+  LoopScalarPreHeader =
+      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+                 nullptr, "scalar.ph");
+  // We intentionally don't let SplitBlock to update LoopInfo since
+  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
+  // LoopVectorBody is explicitly added to the correct place few lines later.
+  LoopVectorBody =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 nullptr, nullptr, "vector.body");
+
+  // Update dominator for loop exit.
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
 
   // Create and register the new vector loop.
   Loop *Lp = LI->AllocateLoop();
@@ -2949,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // before calling any utilities such as SCEV that require valid LoopInfo.
   if (ParentLoop) {
     ParentLoop->addChildLoop(Lp);
-    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
   } else {
     LI->addTopLevelLoop(Lp);
   }
-  Lp->addBasicBlockToLoop(VecBody, *LI);
+  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
 
   // Find the loop boundaries.
   Value *Count = getOrCreateTripCount(Lp);
@@ -2966,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // backedge-taken count is uint##_max: adding one to it will overflow leading
   // to an incorrect trip count of zero. In this (rare) case we will also jump
   // to the scalar loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
+  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
 
   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
-  emitSCEVChecks(Lp, ScalarPH);
+  emitSCEVChecks(Lp, LoopScalarPreHeader);
 
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, ScalarPH);
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
 
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
@@ -3003,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     InductionDescriptor II = InductionEntry.second;
 
     // Create phi nodes to merge from the backedge-taken check block.
-    PHINode *BCResumeVal = PHINode::Create(
-        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+    PHINode *BCResumeVal =
+        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+                        LoopScalarPreHeader->getTerminator());
     // Copy original phi DL over to the new one.
     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
     Value *&EndValue = IVEndValues[OrigPhi];
@@ -3015,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
       Type *StepType = II.getStep()->getType();
       Instruction::CastOps CastOp =
-        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
-      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
       EndValue->setName("ind.end");
     }
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    BCResumeVal->addIncoming(EndValue, MiddleBlock);
+    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
 
     // Fix the scalar body counter (PHI node).
     // The old induction's phi node in the scalar body needs the truncated
    // value.
     for (BasicBlock *BB : LoopBypassBlocks)
       BCResumeVal->addIncoming(II.getStartValue(), BB);
-    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
+    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
   }
 
   // We need the OrigLoop (scalar loop part) latch terminator to help
@@ -3049,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // If tail is to be folded, we know we don't need to run the remainder.
   Value *CmpN = Builder.getTrue();
   if (!Cost->foldTailByMasking()) {
-    CmpN =
-        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+    CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                           CountRoundDown, "cmp.n",
+                           LoopMiddleBlock->getTerminator());
 
     // Here we use the same DebugLoc as the scalar loop latch branch instead
     // of the corresponding compare because they may have ended up with
@@ -3060,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
   }
 
-  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
+  BranchInst *BrInst =
+      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
-  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
+  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
 
   // Get ready to start creating new instructions into the vectorized body.
-  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
-
-  // Save the state.
-  LoopVectorPreHeader = Lp->getLoopPreheader();
-  LoopScalarPreHeader = ScalarPH;
-  LoopMiddleBlock = MiddleBlock;
-  LoopExitBlock = ExitBlock;
-  LoopVectorBody = VecBody;
-  LoopScalarBody = OldBasicBlock;
+  assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
+         "Inconsistent vector loop preheader");
+  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
 
   Optional<MDNode *> VectorizedLoopID =
       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
@@ -3094,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     LoopVectorizeHints Hints(Lp, true, *ORE);
     Hints.setAlreadyVectorized();
 
+#ifdef EXPENSIVE_CHECKS
+  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+  LI->verify(*DT);
+#endif
+
   return LoopVectorPreHeader;
 }
 
@@ -3429,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
   // This is the second stage of vectorizing recurrences.
   fixCrossIterationPHIs();
 
-  // Update the dominator tree.
-  //
-  // FIXME: After creating the structure of the new loop, the dominator tree is
-  //        no longer up-to-date, and it remains that way until we update it
-  //        here. An out-of-date dominator tree is problematic for SCEV,
-  //        because SCEVExpander uses it to guide code generation. The
-  //        vectorizer use SCEVExpanders in several places. Instead, we should
-  //        keep the dominator tree up-to-date as we go.
-  updateAnalysis();
+  // Forget the original basic block.
+  PSE.getSE()->forgetLoop(OrigLoop);
 
   // Fix-up external users of the induction variables.
   for (auto &Entry : *Legal->getInductionVars())
@@ -3550,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
   // among all unrolled iterations, due to the order of their construction.
   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
 
-  // Set the insertion point after the previous value if it is an instruction.
+  // Find and set the insertion point after the previous value if it is an
+  // instruction.
+  BasicBlock::iterator InsertPt;
   // Note that the previous value may have been constant-folded so it is not
-  // guaranteed to be an instruction in the vector loop. Also, if the previous
-  // value is a phi node, we should insert after all the phi nodes to avoid
-  // breaking basic block verification.
-  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
-      isa<PHINode>(PreviousLastPart))
-    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
-  else
-    Builder.SetInsertPoint(
-        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
+  // guaranteed to be an instruction in the vector loop.
+  // FIXME: Loop invariant values do not form recurrences. We should deal with
+  //        them earlier.
+  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
+    InsertPt = LoopVectorBody->getFirstInsertionPt();
+  else {
+    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
+    if (isa<PHINode>(PreviousLastPart))
+      // If the previous value is a phi node, we should insert after all the phi
+      // nodes in the block containing the PHI to avoid breaking basic block
+      // verification. Note that the basic block may be different to
+      // LoopVectorBody, in case we predicate the loop.
+      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
+    else
+      InsertPt = ++PreviousInst->getIterator();
+  }
+  Builder.SetInsertPoint(&*InsertPt);
 
   // We will construct a vector for the recurrence by combining the values for
   // the current and previous iterations. This is the required shuffle mask.
@@ -3693,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
     }
   }
 
+  // Wrap flags are in general invalid after vectorization, clear them.
+  clearReductionWrapFlags(RdxDesc);
+
   // Fix the vector-loop phi.
 
   // Reductions do not have to start at zero. They can start with
   // any loop invariant values.
   BasicBlock *Latch = OrigLoop->getLoopLatch();
   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
     Value *Val = getOrCreateVectorValue(LoopVal, Part);
-    // Make sure to add the reduction stat value only to the
+    // Make sure to add the reduction start value only to the
    // first unroll part.
     Value *StartVal = (Part == 0) ? VectorStart : Identity;
     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
@@ -3839,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
 }
 
+void InnerLoopVectorizer::clearReductionWrapFlags(
+    RecurrenceDescriptor &RdxDesc) {
+  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+  if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
+      RK != RecurrenceDescriptor::RK_IntegerMult)
+    return;
+
+  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
+  assert(LoopExitInstr && "null loop exit instruction");
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  Worklist.push_back(LoopExitInstr);
+  Visited.insert(LoopExitInstr);
+
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.pop_back_val();
+    if (isa<OverflowingBinaryOperator>(Cur))
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *V = getOrCreateVectorValue(Cur, Part);
+        cast<Instruction>(V)->dropPoisonGeneratingFlags();
+      }
+
+    for (User *U : Cur->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
+          Visited.insert(UI).second)
+        Worklist.push_back(UI);
+    }
+  }
+}
+
 void InnerLoopVectorizer::fixLCSSAPHIs() {
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
@@ -3960,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
   }
 }
 
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
+                                   unsigned VF, bool IsPtrLoopInvariant,
+                                   SmallBitVector &IsIndexLoopInvariant) {
+  // Construct a vector GEP by widening the operands of the scalar GEP as
+  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+  // results in a vector of pointers when at least one operand of the GEP
+  // is vector-typed. Thus, to keep the representation compact, we only use
+  // vector-typed operands for loop-varying values.
+
+  if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+    // If we are vectorizing, but the GEP has only loop-invariant operands,
+    // the GEP we build (by only using vector-typed operands for
+    // loop-varying values) would be a scalar pointer. Thus, to ensure we
+    // produce a vector of pointers, we need to either arbitrarily pick an
+    // operand to broadcast, or broadcast a clone of the original GEP.
+    // Here, we broadcast a clone of the original.
+    //
+    // TODO: If at some point we decide to scalarize instructions having
+    //       loop-invariant operands, this special case will no longer be
+    //       required. We would add the scalarization decision to
+    //       collectLoopScalars() and teach getVectorValue() to broadcast
+    //       the lane-zero scalar value.
+    auto *Clone = Builder.Insert(GEP->clone());
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+      addMetadata(EntryPart, GEP);
+    }
+  } else {
+    // If the GEP has at least one loop-varying operand, we are sure to
+    // produce a vector of pointers. But if we are only unrolling, we want
+    // to produce a scalar GEP for each unroll part. Thus, the GEP we
+    // produce with the code below will be scalar (if VF == 1) or vector
+    // (otherwise). Note that for the unroll-only case, we still maintain
+    // values in the vector mapping with initVector, as we do for other
+    // instructions.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      // The pointer operand of the new GEP. If it's loop-invariant, we
+      // won't broadcast it.
+      auto *Ptr = IsPtrLoopInvariant
+                      ? GEP->getPointerOperand()
+                      : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+      // Collect all the indices for the new GEP. If any index is
+      // loop-invariant, we won't broadcast it.
+      SmallVector<Value *, 4> Indices;
+      for (auto Index : enumerate(GEP->indices())) {
+        Value *User = Index.value().get();
+        if (IsIndexLoopInvariant[Index.index()])
+          Indices.push_back(User);
+        else
+          Indices.push_back(getOrCreateVectorValue(User, Part));
+      }
+
+      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+      // but it should be a vector, otherwise.
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+                                          Indices)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+             "NewGEP is not a pointer vector");
+      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+      addMetadata(NewGEP, GEP);
+    }
+  }
+}
+
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
   PHINode *P = cast<PHINode>(PN);
@@ -4062,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
   case Instruction::Br:
   case Instruction::PHI:
+  case Instruction::GetElementPtr:
     llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::GetElementPtr: {
-    // Construct a vector GEP by widening the operands of the scalar GEP as
-    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-    // results in a vector of pointers when at least one operand of the GEP
-    // is vector-typed. Thus, to keep the representation compact, we only use
-    // vector-typed operands for loop-varying values.
-    auto *GEP = cast<GetElementPtrInst>(&I);
-
-    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
-      // If we are vectorizing, but the GEP has only loop-invariant operands,
-      // the GEP we build (by only using vector-typed operands for
-      // loop-varying values) would be a scalar pointer. Thus, to ensure we
-      // produce a vector of pointers, we need to either arbitrarily pick an
-      // operand to broadcast, or broadcast a clone of the original GEP.
-      // Here, we broadcast a clone of the original.
-      //
-      // TODO: If at some point we decide to scalarize instructions having
-      //       loop-invariant operands, this special case will no longer be
-      //       required. We would add the scalarization decision to
-      //       collectLoopScalars() and teach getVectorValue() to broadcast
-      //       the lane-zero scalar value.
-      auto *Clone = Builder.Insert(GEP->clone());
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
-        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
-        addMetadata(EntryPart, GEP);
-      }
-    } else {
-      // If the GEP has at least one loop-varying operand, we are sure to
-      // produce a vector of pointers. But if we are only unrolling, we want
-      // to produce a scalar GEP for each unroll part. Thus, the GEP we
-      // produce with the code below will be scalar (if VF == 1) or vector
-      // (otherwise). Note that for the unroll-only case, we still maintain
-      // values in the vector mapping with initVector, as we do for other
-      // instructions.
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        // The pointer operand of the new GEP. If it's loop-invariant, we
-        // won't broadcast it.
-        auto *Ptr =
-            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
-                ? GEP->getPointerOperand()
-                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
-
-        // Collect all the indices for the new GEP. If any index is
-        // loop-invariant, we won't broadcast it.
-        SmallVector<Value *, 4> Indices;
-        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
-          if (OrigLoop->isLoopInvariant(U.get()))
-            Indices.push_back(U.get());
-          else
-            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
-        }
-
-        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-        // but it should be a vector, otherwise.
-        auto *NewGEP =
-            GEP->isInBounds()
-                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
-                                            Indices)
-                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
-        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
-               "NewGEP is not a pointer vector");
-        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
-        addMetadata(NewGEP, GEP);
-      }
-    }
-
-    break;
-  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
@@ -4335,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   } // end of switch.
 }
 
-void InnerLoopVectorizer::updateAnalysis() {
-  // Forget the original basic block.
-  PSE.getSE()->forgetLoop(OrigLoop);
-
-  // DT is not kept up-to-date for outer loop vectorization
-  if (EnableVPlanNativePath)
-    return;
-
-  // Update the dominator tree information.
-  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
-         "Entry does not dominate exit.");
-
-  DT->addNewBlock(LoopMiddleBlock,
-                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
-  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
-  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
-  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   // We should not collect Scalars more than once per VF. Right now, this
   // function is called from collectUniformsAndScalars(), which already does
@@ -4562,9 +4619,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
       return WideningDecision == CM_Scalarize;
     }
     const MaybeAlign Alignment = getLoadStoreAlignment(I);
-    return isa<LoadInst>(I) ?
-        !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
-      : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
+    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
+                                isLegalMaskedGather(Ty, Alignment))
+                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
+                                isLegalMaskedScatter(Ty, Alignment));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
@@ -4667,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
   SetVector<Instruction *> Worklist;
   BasicBlock *Latch = TheLoop->getLoopLatch();
 
+  // Instructions that are scalar with predication must not be considered
+  // uniform after vectorization, because that would create an erroneous
+  // replicating region where only a single instance out of VF should be formed.
+  // TODO: optimize such seldom cases if found important, see PR40816.
+  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+    if (isScalarWithPredication(I, VF)) {
+      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
+      return;
+    }
+    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+    Worklist.insert(I);
+  };
+
   // Start with the conditional branch. If the branch condition is an
   // instruction contained in the loop that is only used by the branch, it is
   // uniform.
   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
-  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
-    Worklist.insert(Cmp);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
-  }
+  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    addToWorklistIfAllowed(Cmp);
 
   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
   // are pointers that are treated like consecutive pointers during
@@ -4733,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
   // Add to the Worklist all consecutive and consecutive-like pointers that
   // aren't also identified as possibly non-uniform.
   for (auto *V : ConsecutiveLikePtrs)
-    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
-      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
-      Worklist.insert(V);
-    }
+    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+      addToWorklistIfAllowed(V);
 
   // Expand Worklist in topological order: whenever a new instruction
   // is added , its users should be already inside Worklist. It ensures
@@ -4762,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
             return Worklist.count(J) ||
                    (OI == getLoadStorePointerOperand(J) &&
                     isUniformDecision(J, VF));
-          })) {
-        Worklist.insert(OI);
-        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
-      }
+          }))
+        addToWorklistIfAllowed(OI);
     }
   }
 
@@ -4807,11 +4873,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
       continue;
 
     // The induction variable and its update instruction will remain uniform.
-    Worklist.insert(Ind);
-    Worklist.insert(IndUpdate);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
-                      << "\n");
+    addToWorklistIfAllowed(Ind);
+    addToWorklistIfAllowed(IndUpdate);
  }
 
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
@@ -5143,9 +5206,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small trip count.
-  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count.
+  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5208,12 +5272,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
     MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  // If the trip count is constant, limit the interleave count to be less than
-  // the trip count divided by VF.
-  if (TC > 0) {
-    assert(TC >= VF && "VF exceeds trip count?");
-    if ((TC / VF) < MaxInterleaveCount)
-      MaxInterleaveCount = (TC / VF);
+  // If trip count is known or estimated compile time constant, limit the
+  // interleave count to be less than the trip count divided by VF.
+  if (BestKnownTC) {
+    MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
   }
 
   // If we did not calculate the cost for VF (because the user selected the VF)
@@ -5746,7 +5808,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // vectorized loop where the user of it is a vectorized instruction.
   const MaybeAlign Alignment = getLoadStoreAlignment(I);
   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                   Alignment ? Alignment->value() : 0, AS);
+                                   Alignment, AS);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5783,8 +5845,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
                                       Alignment ? Alignment->value() : 0, AS);
   else
-    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
-                                Alignment ? Alignment->value() : 0, AS, I);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
 
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
@@ -5800,16 +5861,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   unsigned AS = getLoadStoreAddressSpace(I);
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(Instruction::Load, ValTy,
-                               Alignment ? Alignment->value() : 0, AS) +
+           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
   }
   StoreInst *SI = cast<StoreInst>(I);
 
   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy,
-                             Alignment ? Alignment->value() : 0, AS) +
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
          (isLoopInvariantStoreValue
               ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
@@ -5877,8 +5936,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
     unsigned AS = getLoadStoreAddressSpace(I);
 
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy,
-                               Alignment ? Alignment->value() : 0, AS, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
   }
   return getWideningCost(I, VF);
 }
@@ -6217,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
     return N * TTI.getArithmeticInstrCost(
                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
-                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
+                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
-                   I->getOperand(0));
+                   I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
@@ -6714,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
   return BlockMaskCache[BB] = BlockMask;
 }
 
-VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
-                                                           VFRange &Range,
-                                                           VPlanPtr &Plan) {
-  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
-  if (!IG)
-    return nullptr;
-
-  // Now check if IG is relevant for VF's in the given range.
-  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
-    return [=](unsigned VF) -> bool {
-      return (VF >= 2 && // Query is illegal for VF == 1
-              CM.getWideningDecision(I, VF) ==
-                  LoopVectorizationCostModel::CM_Interleave);
-    };
-  };
-  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
-    return nullptr;
-
-  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
-  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
-  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
-  assert(I == IG->getInsertPos() &&
-         "Generating a recipe for an adjunct member of an interleave group");
-
-  VPValue *Mask = nullptr;
-  if (Legal->isMaskRequired(I))
-    Mask = createBlockInMask(I->getParent(), Plan);
-
-  return new VPInterleaveRecipe(IG, Mask);
-}
-
 VPWidenMemoryInstructionRecipe *
 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                   VPlanPtr &Plan) {
@@ -6754,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
   auto willWiden = [&](unsigned VF) -> bool {
     if (VF == 1)
       return false;
-    if (CM.isScalarAfterVectorization(I, VF) ||
-        CM.isProfitableToScalarize(I, VF))
-      return false;
     LoopVectorizationCostModel::InstWidening Decision =
         CM.getWideningDecision(I, VF);
     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
-    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
-           "Interleave memory opportunity should be caught earlier.");
+    if (Decision == LoopVectorizationCostModel::CM_Interleave)
+      return true;
+    if (CM.isScalarAfterVectorization(I, VF) ||
+        CM.isProfitableToScalarize(I, VF))
+      return false;
     return Decision != LoopVectorizationCostModel::CM_Scalarize;
   };
 
@@ -6773,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
   if (Legal->isMaskRequired(I))
     Mask = createBlockInMask(I->getParent(), Plan);
 
-  return new VPWidenMemoryInstructionRecipe(*I, Mask);
+  VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
+  return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
 }
 
 VPWidenIntOrFpInductionRecipe *
@@ -6861,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
   case Instruction::FPTrunc:
   case Instruction::FRem:
   case Instruction::FSub:
-  case Instruction::GetElementPtr:
   case Instruction::ICmp:
   case Instruction::IntToPtr:
   case Instruction::Load:
@@ -6926,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. - if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6951,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -7009,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. 
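// For illustration: the GEP handling just below, like the other recipe
// checks, funnels its per-VF decision through
// LoopVectorizationPlanner::getDecisionAndClampRange. A simplified,
// stand-alone sketch of that helper follows (VFRange stands in for the
// real struct; this is a model under assumed types, not the exact LLVM
// declaration):

#include <functional>

struct VFRange { unsigned Start, End; }; // candidate VFs in [Start, End)

static bool getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  // Take the decision at Range.Start, then clamp Range.End at the first
  // power-of-2 VF whose decision diverges, so a single recipe can serve
  // the whole remaining range.
  bool PredicateAtRangeStart = Predicate(Range.Start);
  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }
  return PredicateAtRangeStart;
}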
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -7094,19 +7129,57 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, SmallPtrSetImpl<Instruction *> &DeadInstructions) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -7125,10 +7198,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -7138,43 +7208,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -7199,6 +7232,33 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. 
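// The moveAfter()/getRecipe() transform above and the interleave-group
// rewrite below both depend on the ingredient-to-recipe map maintained by
// recordRecipeOf()/setRecipe(). A minimal sketch of that bookkeeping, with
// the surrounding VPRecipeBuilder class and the LLVM types (DenseMap,
// Instruction, VPRecipeBase) assumed from context:

DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;

/// Mark I as an ingredient whose recipe must be recorded once created.
void recordRecipeOf(Instruction *I) { Ingredient2Recipe[I] = nullptr; }

/// Record R as I's recipe, but only for marked ingredients.
void setRecipe(Instruction *I, VPRecipeBase *R) {
  auto It = Ingredient2Recipe.find(I);
  if (It != Ingredient2Recipe.end())
    It->second = R;
}

/// Return the recipe recorded for I; by transform time it must exist.
VPRecipeBase *getRecipe(Instruction *I) {
  assert(Ingredient2Recipe.lookup(I) && "Recipe was not recorded");
  return Ingredient2Recipe.lookup(I);
}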
+ for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. if (CM.foldTailByMasking()) { @@ -7255,9 +7315,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7266,13 +7325,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7286,6 +7353,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7336,15 +7408,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7431,29 +7496,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); - - // Last (and currently only) operand is a mask. 
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); -} - -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || Hints.getPredicate()) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; - - return SEL; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} + +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7470,14 +7552,16 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. 
Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); @@ -7562,7 +7646,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7635,7 +7720,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 974eff9974d9..aabd974cd73e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -72,6 +73,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -127,6 +129,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<int> +MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, + cl::desc("Maximum depth of the lookup for consecutive stores.")); + /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited to this value. +static cl::opt<unsigned> LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. 
This prevents compilation time increase."));
+
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
@@ -547,7 +567,7 @@ public:
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
-  /// into account (anf updating it, if required) list of externally used
+  /// into account (and updating it, if required) list of externally used
   /// values stored in \p ExternallyUsedValues.
   void buildTree(ArrayRef<Value *> Roots,
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
@@ -609,7 +629,10 @@ public:
     return MinVecRegSize;
   }
 
-  /// Check if ArrayType or StructType is isomorphic to some VectorType.
+  /// Check if homogeneous aggregate is isomorphic to some VectorType.
+  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
+  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
+  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -721,6 +744,7 @@ public:
 
     const DataLayout &DL;
     ScalarEvolution &SE;
+    const BoUpSLP &R;
 
     /// \returns the operand data at \p OpIdx and \p Lane.
     OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -746,6 +770,227 @@ public:
       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
     }
 
+    // The hard-coded scores listed here are not very important. When computing
+    // the scores of matching one sub-tree with another, we are basically
+    // counting the number of values that are matching. So even if all scores
+    // are set to 1, we would still get a decent matching result.
+    // However, sometimes we have to break ties. For example we may have to
+    // choose between matching loads vs matching opcodes. This is what these
+    // scores are helping us with: they provide the order of preference.
+
+    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+    static const int ScoreConsecutiveLoads = 3;
+    /// ExtractElementInst from same vector and consecutive indexes.
+    static const int ScoreConsecutiveExtracts = 3;
+    /// Constants.
+    static const int ScoreConstants = 2;
+    /// Instructions with the same opcode.
+    static const int ScoreSameOpcode = 2;
+    /// Instructions with alt opcodes (e.g., add + sub).
+    static const int ScoreAltOpcodes = 1;
+    /// Identical instructions (a.k.a. splat or broadcast).
+    static const int ScoreSplat = 1;
+    /// Matching with an undef is preferable to failing.
+    static const int ScoreUndef = 1;
+    /// Score for failing to find a decent match.
+    static const int ScoreFail = 0;
+    /// User external to the vectorized code.
+    static const int ExternalUseCost = 1;
+    /// The user is internal but in a different lane.
+    static const int UserInDiffLaneCost = ExternalUseCost;
+
+    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+                               ScalarEvolution &SE) {
+      auto *LI1 = dyn_cast<LoadInst>(V1);
+      auto *LI2 = dyn_cast<LoadInst>(V2);
+      if (LI1 && LI2)
+        return isConsecutiveAccess(LI1, LI2, DL, SE)
+                   ? VLOperands::ScoreConsecutiveLoads
+                   : VLOperands::ScoreFail;
+
+      auto *C1 = dyn_cast<Constant>(V1);
+      auto *C2 = dyn_cast<Constant>(V2);
+      if (C1 && C2)
+        return VLOperands::ScoreConstants;
+
+      // Extracts from consecutive indexes of the same vector score better, as
+      // the extracts could be optimized away.
+      auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
+      auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
+      if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
+          cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
+              cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+        return VLOperands::ScoreConsecutiveExtracts;
+      }
+
+      auto *I1 = dyn_cast<Instruction>(V1);
+      auto *I2 = dyn_cast<Instruction>(V2);
+      if (I1 && I2) {
+        if (I1 == I2)
+          return VLOperands::ScoreSplat;
+        InstructionsState S = getSameOpcode({I1, I2});
+        // Note: Only consider instructions with <= 2 operands to avoid
+        // complexity explosion.
+        if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
+          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+                                  : VLOperands::ScoreSameOpcode;
+      }
+
+      if (isa<UndefValue>(V2))
+        return VLOperands::ScoreUndef;
+
+      return VLOperands::ScoreFail;
+    }
+
+    /// Holds the values and their lane that are taking part in the look-ahead
+    /// score calculation. This is used in the external uses cost calculation.
+    SmallDenseMap<Value *, int> InLookAheadValues;
+
+    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
+    /// either external to the vectorized code, or require shuffling.
+    int getExternalUsesCost(const std::pair<Value *, int> &LHS,
+                            const std::pair<Value *, int> &RHS) {
+      int Cost = 0;
+      SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+      for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
+        Value *V = Values[Idx].first;
+        // Calculate the absolute lane, using the minimum relative lane of LHS
+        // and RHS as base and Idx as the offset.
+        int Ln = std::min(LHS.second, RHS.second) + Idx;
+        assert(Ln >= 0 && "Bad lane calculation");
+        unsigned UsersBudget = LookAheadUsersBudget;
+        for (User *U : V->users()) {
+          if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
+            // The user is in the VectorizableTree. Check if we need to insert.
+            auto It = llvm::find(UserTE->Scalars, U);
+            assert(It != UserTE->Scalars.end() && "U is in UserTE");
+            int UserLn = std::distance(UserTE->Scalars.begin(), It);
+            assert(UserLn >= 0 && "Bad lane");
+            if (UserLn != Ln)
+              Cost += UserInDiffLaneCost;
+          } else {
+            // Check if the user is in the look-ahead code.
+            auto It2 = InLookAheadValues.find(U);
+            if (It2 != InLookAheadValues.end()) {
+              // The user is in the look-ahead code. Check the lane.
+              if (It2->second != Ln)
+                Cost += UserInDiffLaneCost;
+            } else {
+              // The user is neither in SLP tree nor in the look-ahead code.
+              Cost += ExternalUseCost;
+            }
+          }
+          // Limit the number of visited uses to cap compilation time.
+          if (--UsersBudget == 0)
+            break;
+        }
+      }
+      return Cost;
+    }
+
+    /// Go through the operands of \p LHS and \p RHS recursively until \p
+    /// MaxLevel, and return the cumulative score. For example:
+    /// \verbatim
+    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
+    ///     \ /       \ /        \ /       \ /
+    ///      +         +          +         +
+    ///      G1        G2         G3        G4
+    /// \endverbatim
+    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+    /// each level recursively, accumulating the score. It starts from matching
+    /// the additions at level 0, then moves on to the loads (level 1). The
+    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+    /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+    /// Please note that the order of the operands does not matter, as we
+    /// evaluate the score of all profitable combinations of operands. In
+    /// other words the score of G1 and G4 is the same as G1 and G2. This
+    /// heuristic is based on ideas described in:
+    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
+    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+    ///   Luís F. W. Góes
+    int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
+                           const std::pair<Value *, int> &RHS, int CurrLevel,
+                           int MaxLevel) {
+
+      Value *V1 = LHS.first;
+      Value *V2 = RHS.first;
+      // Get the shallow score of V1 and V2.
+      int ShallowScoreAtThisLevel =
+          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
+                                       getExternalUsesCost(LHS, RHS));
+      int Lane1 = LHS.second;
+      int Lane2 = RHS.second;
+
+      // If reached MaxLevel,
+      //  or if V1 and V2 are not instructions,
+      //  or if they are SPLAT,
+      //  or if they are not consecutive, early return the current cost.
+      auto *I1 = dyn_cast<Instruction>(V1);
+      auto *I2 = dyn_cast<Instruction>(V2);
+      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+        return ShallowScoreAtThisLevel;
+      assert(I1 && I2 && "Should have early exited.");
+
+      // Keep track of in-tree values for determining the external-use cost.
+      InLookAheadValues[V1] = Lane1;
+      InLookAheadValues[V2] = Lane2;
+
+      // Contains the I2 operand indexes that got matched with I1 operands.
+      SmallSet<unsigned, 4> Op2Used;
+
+      // Recursion towards the operands of I1 and I2. We are trying all possible
+      // operand pairs, and keeping track of the best score.
+      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+           OpIdx1 != NumOperands1; ++OpIdx1) {
+        // Try to pair operand OpIdx1 of I1 with the best operand of I2.
+        int MaxTmpScore = 0;
+        unsigned MaxOpIdx2 = 0;
+        bool FoundBest = false;
+        // If I2 is commutative try all combinations.
+        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
+        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
+        assert(FromIdx <= ToIdx && "Bad index");
+        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
+          // Skip operands already paired with OpIdx1.
+          if (Op2Used.count(OpIdx2))
+            continue;
+          // Recursively calculate the cost at each level.
+          int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
+                                            {I2->getOperand(OpIdx2), Lane2},
+                                            CurrLevel + 1, MaxLevel);
+          // Look for the best score.
+          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+            MaxTmpScore = TmpScore;
+            MaxOpIdx2 = OpIdx2;
+            FoundBest = true;
+          }
+        }
+        if (FoundBest) {
+          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+          Op2Used.insert(MaxOpIdx2);
+          ShallowScoreAtThisLevel += MaxTmpScore;
+        }
+      }
+      return ShallowScoreAtThisLevel;
+    }
+
+    /// \Returns the look-ahead score, which tells us how much the sub-trees
+    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
+    /// score. This helps break ties in an informed way when we cannot decide on
+    /// the order of the operands by just considering the immediate
+    /// predecessors.
+    int getLookAheadScore(const std::pair<Value *, int> &LHS,
+                          const std::pair<Value *, int> &RHS) {
+      InLookAheadValues.clear();
+      return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+    }
+
     // Search all operands in Ops[*][Lane] for the one that matches best
     // Ops[OpIdx][LastLane] and return its operand index.
     // If no good match can be found, return None.
@@ -763,9 +1008,6 @@ public:
       // The linearized opcode of the operand at OpIdx, Lane.
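// A worked example of the scoring above, using the G1..G4 groups from the
// getScoreAtLevelRec() comment and assuming no external-use costs and the
// default look-ahead depth of 2: for G1 = A[0]+B[0] and G2 = A[1]+B[1],
// the two adds first contribute ScoreSameOpcode (2); recursing one level,
// {A[0],A[1]} and {B[0],B[1]} each contribute ScoreConsecutiveLoads (3),
// for a total of 2 + 3 + 3 = 8. Against G3 = C[0]+D[0] both operand pairs
// yield ScoreFail (0), leaving only the 2 from the matching adds.
// G4 = B[1]+A[1] also scores 8, because the commutative search tries both
// operand orders.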
bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -794,41 +1036,19 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa<LoadInst>(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -959,8 +1179,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1189,7 +1409,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1211,7 +1432,8 @@ private: Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather = false; + enum EntryState { Vectorize, NeedToGather }; + EntryState State; /// Does this sequence require some shuffling? 
SmallVector<unsigned, 4> ReuseShuffleIndices; @@ -1353,15 +1575,30 @@ private: dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; - dbgs() << "MainOp: " << *MainOp << "\n"; - dbgs() << "AltOp: " << *AltOp << "\n"; + dbgs() << "State: "; + switch (State) { + case Vectorize: + dbgs() << "Vectorize\n"; + break; + case NeedToGather: + dbgs() << "NeedToGather\n"; + break; + } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) - dbgs() << *VectorizedValue; + dbgs() << *VectorizedValue << "\n"; else - dbgs() << "NULL"; - dbgs() << "\n"; + dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; @@ -1392,7 +1629,7 @@ private: TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; @@ -1721,7 +1958,7 @@ private: return nullptr; } - bool isInSchedulingRegion(ScheduleData *SD) { + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -2063,7 +2300,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) return "color=red"; return ""; } @@ -2115,7 +2352,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; // For each lane: @@ -2152,7 +2389,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); continue; } } @@ -2448,7 +2685,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; @@ -2543,7 +2780,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2590,7 +2827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // have the same opcode. 
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2637,9 +2874,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // We don't combine GEPs with non-constant indexes. + Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { auto Op = cast<Instruction>(V)->getOperand(1); - if (!isa<ConstantInt>(Op)) { + if (!isa<ConstantInt>(Op) || + (Op->getType() != Ty1 && + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace()))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -2665,24 +2907,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. + SmallVector<Value *, 4> PointerOps(VL.size()); + ValueList Operands(VL.size()); + auto POIter = PointerOps.begin(); + auto OIter = Operands.begin(); + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; } + *POIter = SI->getPointerOperand(); + *OIter = SI->getValueOperand(); + ++POIter; + ++OIter; + } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + // Need to reorder. 
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + } + return; + } + } - ValueList Operands; - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(0)); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -2777,7 +3069,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2806,27 +3098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast<StructType>(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast<ArrayType>(T)->getNumElements(); - EltTy = cast<ArrayType>(T)->getElementType(); + unsigned N = 1; + Type *EltTy = T; + + while (isa<CompositeType>(EltTy)) { + if (auto *ST = dyn_cast<StructType>(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast<SequentialType>(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } + if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. - for (const auto *Ty : ST->elements()) - if (Ty != EltTy) - return 0; - } return N; } @@ -2927,7 +3221,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { @@ -2995,7 +3289,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
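// A worked example of the consecutive-store check above (hypothetical
// bundle): for four i32 stores to A[7], A[4], A[5], A[6], sortPtrAccesses
// returns CurrentOrder = {1, 2, 3, 0}, so Ptr0 = &A[4] and PtrN = &A[7].
// SE->getMinusSCEV(ScevN, Scev0) folds to 12 bytes, which matches
// (VL.size() - 1) * Size = 3 * 4, so the bundle is accepted as a vector
// of jumbled stores and later fixed up with a single reorder shuffle.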
@@ -3135,13 +3429,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost( + E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3162,7 +3456,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { @@ -3180,15 +3474,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = + cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + MaybeAlign Alignment(SI->getAlignment()); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, + VecTy, Alignment, 0, VL0); + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + VecStCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3274,20 +3575,22 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; @@ -3397,7 +3700,7 @@ int BoUpSLP::getSpillCost() const { continue; } - // Debug informations don't impact spill cost. + // Debug information does not impact spill cost. 
if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && &*PrevInstIt != PrevInst) @@ -3441,12 +3744,13 @@ int BoUpSLP::getTreeCost() { // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3538,13 +3842,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -3735,7 +4041,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -3790,7 +4096,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3823,7 +4129,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -4050,15 +4356,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast<StoreInst>( + IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + "reorder_shuffle"); + } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast( + ScalarPtr, VecValue->getType()->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -4088,7 +4404,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - Value *OpVec = vectorizeTree(E->getOperand(j)); + ValueList &VL = E->getOperand(j); + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(j)->getType(); + Type *Ty = llvm::all_of( + VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + for (Value *&V : VL) { + auto *CI = cast<ConstantInt>(V); + V = ConstantExpr::getIntegerCast(CI, Ty, + CI->getValue().isSignBitSet()); + } + Value *OpVec = vectorizeTree(VL); OpVecs.push_back(OpVec); } @@ -4284,7 +4615,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4357,7 +4688,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -5332,125 +5663,140 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); - LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + unsigned Idx) { + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned VF = VecRegSize / Sz; + const unsigned MinVF = R.getMinVecRegSize() / Sz; + unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || VF < 2) + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) return false; - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { - - ArrayRef<Value *> Operands = Chain.slice(i, VF); - // Check that a previous iteration of this loop did not delete the Value. 
- if (llvm::any_of(Operands, [&R](Value *V) { - auto *I = dyn_cast<Instruction>(V); - return I && R.isDeleted(I); - })) - continue; - - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - - R.buildTree(Operands); - if (R.isTreeTinyAndNotFullyVectorizable()) - continue; + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx + << "\n"); - R.computeMinimumValueSizes(); + R.buildTree(Chain); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); + llvm::transform(*Order, ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; - int Cost = R.getTreeCost(); + R.computeMinimumValueSizes(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + int Cost = R.getTreeCost(); - using namespace ore; + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", - cast<StoreInst>(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " - << NV("TreeSize", R.getTreeSize())); + using namespace ore; - R.vectorizeTree(); + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[0])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + R.vectorizeTree(); + return true; } - return Changed; + return false; } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads; - SmallDenseSet<StoreInst *> Tails; - SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; - // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - auto &&FindConsecutiveAccess = - [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { - if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) - return false; - - Tails.insert(Stores[Idx]); - Heads.insert(Stores[K]); - ConsecutiveChain[Stores[K]] = Stores[Idx]; - return true; - }; + int E = Stores.size(); + SmallBitVector Tails(E, false); + SmallVector<int, 16> ConsecutiveChain(E, E + 1); + int MaxIter = MaxStoreLookup.getValue(); + int IterCnt; + auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &ConsecutiveChain](int K, int Idx) { + if (IterCnt >= MaxIter) + return true; + ++IterCnt; + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; + Tails.set(Idx); + ConsecutiveChain[K] = Idx; + return true; + }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. 
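// Continuing the hypothetical A[7], A[4], A[5], A[6] bundle from above in
// the reordered-chain path: if bestOrder() returns {1, 2, 3, 0}, the
// transform builds ReorderedOps = {Chain[1], Chain[2], Chain[3], Chain[0]},
// i.e. the stores in address order A[4..7], and the second buildTree() then
// sees an already-consecutive chain it can cost and vectorize directly.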
- int E = Stores.size(); for (int Idx = E - 1; Idx >= 0; --Idx) { // If a store has multiple consecutive store candidates, search according // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + const int MaxLookDepth = std::max(E - Idx, Idx + 1); + IterCnt = 0; + for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; } // For stores that start but don't end a link in the chain: - for (auto *SI : llvm::reverse(Heads)) { - if (Tails.count(SI)) + for (int Cnt = E; Cnt > 0; --Cnt) { + int I = Cnt - 1; + if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) continue; - // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = SI; // Collect the chain into a list. - while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { - Operands.push_back(I); + while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + Operands.push_back(Stores[I]); // Move to the next value in the chain. I = ConsecutiveChain[I]; } + // If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Stores[0]); + if (MaxVecRegSize % EltSize != 0) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); - Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - break; + unsigned StartIdx = 0; + for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + vectorizeStoreChain(Slice, R, Cnt)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } } @@ -5835,38 +6181,36 @@ class HorizontalReduction { explicit operator bool() const { return Opcode; } - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); + /// Return true if this operation is any kind of minimum or maximum. + bool isMinMax() const { switch (Kind) { + case RK_Arithmetic: + return false; case RK_Min: - case RK_UMin: case RK_Max: + case RK_UMin: case RK_UMax: - return 1; - case RK_Arithmetic: + return true; case RK_None: break; } - return 0; + llvm_unreachable("Reduction kind is not set"); + } + + /// Get the index of the first operand. 
+ unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RK_None) + return 0; + return isMinMax() ? 1 : 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return 2; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: - return 3; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? 3 : 2; } /// Checks if the operation has the same parent as \p P. @@ -5875,79 +6219,46 @@ class HorizontalReduction { "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; - switch (Kind) { - case RK_Arithmetic: - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: { + if (isMinMax()) { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; } + /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->hasOneUse(); - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) return I->hasNUses(2) && (!IsReductionOp || cast<SelectInst>(I)->getCondition()->hasOneUse()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return I->hasOneUse(); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps.assign(1, ReductionOpsType()); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) ReductionOps.assign(2, ReductionOpsType()); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); - } + else + ReductionOps.assign(1, ReductionOpsType()); } + /// Add all reduction operations for the reduction instruction \p I. void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps[0].emplace_back(I); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) { ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); ReductionOps[1].emplace_back(I); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + } else { + ReductionOps[0].emplace_back(I); } } @@ -5980,12 +6291,12 @@ class HorizontalReduction { /// Checks if two operation data are both a reduction op or both a reduced /// value. 
- bool operator==(const OperationData &OD) { + bool operator==(const OperationData &OD) const { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } - bool operator!=(const OperationData &OD) { return !(*this == OD); } + bool operator!=(const OperationData &OD) const { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; @@ -6005,18 +6316,7 @@ class HorizontalReduction { Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { - switch (Kind) { - case RK_Arithmetic: - return nullptr; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return CmpInst::makeCmpResultType(LHS->getType()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr; } /// Creates reduction operation with the current opcode with the IR flags @@ -6400,6 +6700,18 @@ public: assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } + + // The compare instruction of a min/max is the insertion point for new + // instructions and may be replaced with a new compare instruction. + auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + assert(isa<SelectInst>(RdxRootInst) && + "Expected min/max reduction to have select root instruction"); + Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); + assert(isa<Instruction>(ScalarCond) && + "Expected min/max reduction to have compare condition"); + return cast<Instruction>(ScalarCond); + }; + // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -6455,8 +6767,14 @@ public: DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. - Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); + // Emit a reduction. For min/max, the root is a select, but the insertion + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (ReductionData.isMinMax()) + Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -6492,8 +6810,20 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } - // Update users. + + // Update users. For a min/max reduction that ends with a compare and + // select, we also have to RAUW for the compare instruction feeding the + // reduction root. That's because the original compare may have extra uses + // besides the final select of the reduction. + if (ReductionData.isMinMax()) { + if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { + Instruction *ScalarCmp = + getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); + ScalarCmp->replaceAllUsesWith(VecSelect->getCondition()); + } + } ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the // vector reductions. 
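The switch-heavy OperationData helpers earlier in this class all collapse into the single isMinMax() predicate. A reduced mock of that refactor (names mirror the patch; the real class also tracks the opcode and both operands):

#include <cassert>

enum ReductionKind { RK_None, RK_Arithmetic, RK_Min, RK_Max, RK_UMin, RK_UMax };

struct OperationDataSketch {
  ReductionKind Kind;

  bool isMinMax() const {
    assert(Kind != RK_None && "Reduction kind is not set");
    return Kind == RK_Min || Kind == RK_Max || Kind == RK_UMin ||
           Kind == RK_UMax;
  }
  // A min/max is a select fed by a compare, so its operands are
  // (condition, lhs, rhs) and the reduced values start at index 1;
  // arithmetic reductions are plain binary operations.
  unsigned getFirstOperandIndex() const { return isMinMax() ? 1 : 0; }
  unsigned getNumberOfOperands() const { return isMinMax() ? 3 : 2; }
};

int main() {
  OperationDataSketch Max = {RK_Max};
  OperationDataSketch Add = {RK_Arithmetic};
  assert(Max.getNumberOfOperands() == 3 && Max.getFirstOperandIndex() == 1);
  assert(Add.getNumberOfOperands() == 2 && Add.getFirstOperandIndex() == 0);
  return 0;
}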
V.eraseInstructions(IgnoreList); @@ -6619,45 +6949,54 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa<UndefValue>(V)) - break; - LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. -static bool findBuildAggregate(InsertValueInst *IV, - SmallVectorImpl<Value *> &BuildVectorOpds) { +static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + int &UserCost) { + assert((isa<InsertElementInst>(LastInsertInst) || + isa<InsertValueInst>(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); + UserCost = 0; do { - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - Value *V = IV->getAggregateOperand(); - if (isa<UndefValue>(V)) + Value *InsertedOperand; + if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) { + InsertedOperand = IE->getOperand(1); + LastInsertInst = IE->getOperand(0); + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + IE->getType(), CI->getZExtValue()); + } + } else { + auto *IV = cast<InsertValueInst>(LastInsertInst); + InsertedOperand = IV->getInsertedValueOperand(); + LastInsertInst = IV->getAggregateOperand(); + } + if (isa<InsertElementInst>(InsertedOperand) || + isa<InsertValueInst>(InsertedOperand)) { + int TmpUserCost; + SmallVector<Value *, 8> TmpBuildVectorOpds; + if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, + TmpUserCost)) + return false; + BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), + TmpBuildVectorOpds.rend()); + UserCost += TmpUserCost; + } else { + BuildVectorOpds.push_back(InsertedOperand); + } + if (isa<UndefValue>(LastInsertInst)) break; - IV = dyn_cast<InsertValueInst>(V); - if (!IV || !IV->hasOneUse()) + if ((!isa<InsertValueInst>(LastInsertInst) && + !isa<InsertElementInst>(LastInsertInst)) || + !LastInsertInst->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -6825,25 +7164,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { + int 
UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, BuildVectorOpds)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7118,14 +7458,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - // Process the stores in chunks of 16. - // TODO: The limit of 16 inhibits greater vectorization factors. - // For example, AVX2 supports v32i8. Increasing this limit, however, - // may cause a significant compile-time increase. - for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { - unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); - } + Changed |= vectorizeStores(it->second, R); } return Changed; } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 0ca6a6b93cfd..598fb00e956e 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -47,6 +47,24 @@ class VPRecipeBuilder { EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to + // their recipe. To save on memory, only do so for selected ingredients, + // marked by having a nullptr entry in this map. If those ingredients get a + // VPWidenRecipe, also avoid compressing other ingredients into it to avoid + // having to split such recipes later. + DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe; + VPWidenRecipe *LastExtensibleRecipe = nullptr; + + /// Set the recipe created for given ingredient. This operation is a no-op for + /// ingredients that were not marked using a nullptr entry in the map. + void setRecipe(Instruction *I, VPRecipeBase *R) { + if (!Ingredient2Recipe.count(I)) + return; + assert(Ingredient2Recipe[I] == nullptr && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = R; + } + public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -57,16 +75,22 @@ public: /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. 
- /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for + /// it. + void recordRecipeOf(Instruction *I) { + assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = nullptr; + } + + /// Return the recipe created for given ingredient. + VPRecipeBase *getRecipe(Instruction *I) { + assert(Ingredient2Recipe.count(I) && + "Recording this ingredients recipe was not requested"); + assert(Ingredient2Recipe[I] != nullptr && + "Ingredient doesn't have a recipe"); + return Ingredient2Recipe[I]; + } /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4b80d1fb20aa..f1c708720ccf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" @@ -275,18 +276,35 @@ void VPRegionBlock::execute(VPTransformState *State) { } void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insert(InsertPos->getIterator(), this); } +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); return getParent()->getRecipeList().erase(getIterator()); } void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { - InsertPos->getParent()->getRecipeList().splice( - std::next(InsertPos->getIterator()), getParent()->getRecipeList(), - getIterator()); + removeFromParent(); + insertAfter(InsertPos); } void VPInstruction::generateInstruction(VPTransformState &State, @@ -447,14 +465,20 @@ void VPlan::execute(VPTransformState *State) { // We do not attempt to preserve DT for outer loop vectorization currently. 
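The Ingredient2Recipe protocol introduced above is easy to misread: recordRecipeOf() marks an ingredient with a null entry, setRecipe() silently ignores unmarked ingredients, and getRecipe() insists the entry is both marked and filled. A self-contained mock of that contract (Instr and Recipe are bare stand-ins for Instruction and VPRecipeBase):

#include <cassert>
#include <map>

struct Instr {};
struct Recipe {};

struct RecipeBuilderSketch {
  std::map<Instr *, Recipe *> Ingredient2Recipe;

  // Mark an ingredient whose recipe should be recorded once created.
  void recordRecipeOf(Instr *I) { Ingredient2Recipe[I] = nullptr; }

  void setRecipe(Instr *I, Recipe *R) {
    auto It = Ingredient2Recipe.find(I);
    if (It == Ingredient2Recipe.end())
      return; // Recording was never requested for this ingredient: no-op.
    assert(It->second == nullptr && "Recipe already set for ingredient");
    It->second = R;
  }

  Recipe *getRecipe(Instr *I) {
    auto It = Ingredient2Recipe.find(I);
    assert(It != Ingredient2Recipe.end() && It->second &&
           "Recipe was not recorded for this ingredient");
    return It->second;
  }
};

int main() {
  Instr A, B;
  Recipe R;
  RecipeBuilderSketch Builder;
  Builder.recordRecipeOf(&A);
  Builder.setRecipe(&A, &R); // Marked earlier: recorded.
  Builder.setRecipe(&B, &R); // Never marked: silently ignored.
  assert(Builder.getRecipe(&A) == &R);
  return 0;
}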
if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, + L->getExitBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void VPlan::dump() const { dbgs() << *this << '\n'; } +#endif + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { + BasicBlock *LoopLatchBB, + BasicBlock *LoopExitBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. @@ -485,6 +509,9 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); } + // Latch block is a new dominator for the loop exit. + DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -509,8 +536,7 @@ void VPlanPrinter::dump() { if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { OS << ", where:"; if (Plan.BackedgeTakenCount) - OS << "\\n" - << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount"; + OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; for (auto Entry : Plan.Value2VPValue) { OS << "\\n" << *Entry.second; OS << DOT::EscapeString(" := "); @@ -522,7 +548,7 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - for (VPBlockBase *Block : depth_first(Plan.getEntry())) + for (const VPBlockBase *Block : depth_first(Plan.getEntry())) dumpBlock(Block); OS << "}\n"; @@ -661,6 +687,16 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, O << " " << VPlanIngredient(IV) << "\\l\""; } +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? 
"Inv" : "Var") << "]"; + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; +} + void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; } @@ -703,9 +739,12 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 44d8a198f27e..c65abc3639d7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -31,6 +31,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -226,6 +227,8 @@ public: struct VPCallback { virtual ~VPCallback() {} virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; + virtual Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) = 0; }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -268,6 +271,13 @@ struct VPTransformState { return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); } + /// Get the generated Value for a given VPValue and given Part and Lane. Note + /// that as per-lane Defs are still created by ILV and managed in its ValueMap + /// this method currently just delegates the call to ILV. + Value *get(VPValue *Def, const VPIteration &Instance) { + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); + } + /// Set the generated Value for a given VPValue and a given Part. void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -567,6 +577,7 @@ public: /// instructions. class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { friend VPBasicBlock; + friend class VPBlockUtils; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -586,6 +597,7 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, @@ -615,10 +627,18 @@ public: /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked Recipe into a basic block immediately after + /// the specified Recipe. + void insertAfter(VPRecipeBase *InsertPos); + /// Unlink this recipe from its current VPBasicBlock and insert it into /// the VPBasicBlock that MovePos lives in, right after MovePos. void moveAfter(VPRecipeBase *MovePos); + /// This method unlinks 'this' from the containing basic block, but does not + /// delete it. + void removeFromParent(); + /// This method unlinks 'this' from the containing basic block and deletes it. /// /// \returns an iterator pointing to the element after the erased one @@ -630,7 +650,6 @@ public: /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
class VPInstruction : public VPUser, public VPRecipeBase { - friend class VPlanHCFGTransforms; friend class VPlanSlp; public: @@ -740,6 +759,36 @@ public: void print(raw_ostream &O, const Twine &Indent) const override; }; +/// A recipe for handling GEP instructions. +class VPWidenGEPRecipe : public VPRecipeBase { +private: + GetElementPtrInst *GEP; + bool IsPtrLoopInvariant; + SmallBitVector IsIndexLoopInvariant; + +public: + VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + IsIndexLoopInvariant(GEP->getNumIndices(), false) { + IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); + for (auto Index : enumerate(GEP->indices())) + IsIndexLoopInvariant[Index.index()] = + OrigLoop->isLoopInvariant(Index.value().get()); + } + ~VPWidenGEPRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + } + + /// Generate the gep nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { @@ -822,13 +871,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup<Instruction> *IG; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + if (Mask) + User.addOperand(Mask); } ~VPInterleaveRecipe() override = default; @@ -837,6 +887,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -959,13 +1021,14 @@ public: class VPWidenMemoryInstructionRecipe : public VPRecipeBase { private: Instruction &Instr; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) { + if (Mask) + User.addOperand(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -973,6 +1036,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; } + /// Return the address accessed by this recipe. 
+ VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load/store. void execute(VPTransformState &State) override; @@ -1143,6 +1218,128 @@ public: void execute(struct VPTransformState *State) override; }; +//===----------------------------------------------------------------------===// +// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // +//===----------------------------------------------------------------------===// + +// The following set of template specializations implement GraphTraits to treat +// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note +// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the +// VPBlockBase is a VPRegionBlock, this specialization provides access to its +// successors/predecessors but not to the blocks inside the region. + +template <> struct GraphTraits<VPBlockBase *> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Inverse order specialization for VPBasicBlocks. Predecessors are used instead +// of successors for the inverse traversal. +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +// The following set of template specializations implement GraphTraits to +// treat VPRegionBlock as a graph and recurse inside its nodes. It's important +// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases +// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so +// there won't be automatic recursion into other VPBlockBases that turn to be +// VPRegionBlocks. + +template <> +struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. 
+ return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<const VPRegionBlock *> + : public GraphTraits<const VPBlockBase *> { + using GraphRef = const VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<Inverse<VPRegionBlock *>> + : public GraphTraits<Inverse<VPBlockBase *>> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(Inverse<GraphRef> N) { + return N.Graph->getExit(); + } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getExit()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1245,35 +1442,45 @@ public: return Value2VPValue[V]; } + VPValue *getOrAddVPValue(Value *V) { + assert(V && "Trying to get or add the VPValue of a null Value"); + if (!Value2VPValue.count(V)) + addVPValue(V); + return getVPValue(V); + } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Dump the plan to stderr (for debugging). + void dump() const; + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(DominatorTree *DT, + static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); + BasicBlock *LoopExitBB); }; /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); friend inline raw_ostream &operator<<(raw_ostream &OS, const struct VPlanIngredient &I); private: raw_ostream &OS; - VPlan &Plan; - unsigned Depth; + const VPlan &Plan; + unsigned Depth = 0; unsigned TabWidth = 2; std::string Indent; unsigned BID = 0; SmallDenseMap<const VPBlockBase *, unsigned> BlockID; - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {} /// Handle indentation. 
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1320,135 +1527,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { +inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { VPlanPrinter Printer(OS, Plan); Printer.dump(); return OS; } //===----------------------------------------------------------------------===// -// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // -//===----------------------------------------------------------------------===// - -// The following set of template specializations implement GraphTraits to treat -// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note -// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the -// VPBlockBase is a VPRegionBlock, this specialization provides access to its -// successors/predecessors but not to the blocks inside the region. - -template <> struct GraphTraits<VPBlockBase *> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits<const VPBlockBase *> { - using NodeRef = const VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Inverse order specialization for VPBasicBlocks. Predecessors are used instead -// of successors for the inverse traversal. -template <> struct GraphTraits<Inverse<VPBlockBase *>> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -// The following set of template specializations implement GraphTraits to -// treat VPRegionBlock as a graph and recurse inside its nodes. It's important -// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases -// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so -// there won't be automatic recursion into other VPBlockBases that turn to be -// VPRegionBlocks. - -template <> -struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. 
- return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<const VPRegionBlock *> - : public GraphTraits<const VPBlockBase *> { - using GraphRef = const VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<Inverse<VPRegionBlock *>> - : public GraphTraits<Inverse<VPBlockBase *>> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(Inverse<GraphRef> N) { - return N.Graph->getExit(); - } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -//===----------------------------------------------------------------------===// // VPlan Utilities //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b22d3190d654..3f6a2efd55cc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1,4 +1,4 @@ -//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// +//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,13 +11,13 @@ /// //===----------------------------------------------------------------------===// -#include "VPlanHCFGTransforms.h" +#include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; -void VPlanHCFGTransforms::VPInstructionsToVPRecipes( - VPlanPtr &Plan, +void VPlanTransforms::VPInstructionsToVPRecipes( + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions) { @@ -56,7 +56,9 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; // Create VPWidenMemoryInstructionRecipe for loads and stores. if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) - NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/); + NewRecipe = new VPWidenMemoryInstructionRecipe( + *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + nullptr /*Mask*/); else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { InductionDescriptor II = Inductions->lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || @@ -64,6 +66,8 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); } else { // If the last recipe is a VPWidenRecipe, add Inst to it instead of // creating a new recipe. 
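Condensing the dispatch VPInstructionsToVPRecipes performs after this patch: loads and stores get a memory recipe that now carries their address as a VPValue, induction phis and other phis get dedicated recipes, and GEPs map to the new VPWidenGEPRecipe. A skeletal version with stand-in types (the real code switches on the IR instruction itself, not an enum):

#include <cassert>
#include <memory>

struct Recipe { virtual ~Recipe() = default; };
struct WidenMemoryRecipe : Recipe {};
struct WidenIntOrFpInductionRecipe : Recipe {};
struct WidenPHIRecipe : Recipe {};
struct WidenGEPRecipe : Recipe {};
struct WidenRecipe : Recipe {};

enum class InstKind { Load, Store, IntOrFpInductionPhi, OtherPhi, GEP, Other };

std::unique_ptr<Recipe> buildRecipe(InstKind K) {
  switch (K) {
  case InstKind::Load:
  case InstKind::Store:
    // Now also receives the pointer operand as a VPValue (elided here).
    return std::make_unique<WidenMemoryRecipe>();
  case InstKind::IntOrFpInductionPhi:
    return std::make_unique<WidenIntOrFpInductionRecipe>();
  case InstKind::OtherPhi:
    return std::make_unique<WidenPHIRecipe>();
  case InstKind::GEP:
    // New in this patch: GEPs get their own widening recipe.
    return std::make_unique<WidenGEPRecipe>();
  case InstKind::Other:
    return std::make_unique<WidenRecipe>();
  }
  return nullptr;
}

int main() {
  auto R = buildRecipe(InstKind::GEP);
  assert(dynamic_cast<WidenGEPRecipe *>(R.get()) != nullptr);
  return 0;
}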
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 79a23c33184f..0d3bd7da09a7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -1,4 +1,4 @@ -//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// +//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,8 +10,8 @@ /// This file provides utility VPlan to VPlan transformations. //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" #include "llvm/IR/Instruction.h" @@ -19,17 +19,17 @@ namespace llvm { -class VPlanHCFGTransforms { +class VPlanTransforms { public: /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. static void VPInstructionsToVPRecipes( - VPlanPtr &Plan, + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions); }; } // namespace llvm -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 7b6c228c229e..464498c29d89 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -37,7 +37,7 @@ class VPUser; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; - friend class VPlanHCFGTransforms; + friend class VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 394b1b93113b..ab3e7e2282e7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -14,6 +14,7 @@ #include "VPlanVerifier.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "loop-vectorize"
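A closing note on the operand convention the widened memory recipes now share: VPInterleaveRecipe and VPWidenMemoryInstructionRecipe both store the address as a mandatory first operand and the mask as an optional second one, with a null mask meaning all lanes are active. A minimal mock of that layout (VPValue and the VPUser operand list are reduced to plain pointers here):

#include <cassert>
#include <vector>

struct VPValueSketch {};

struct MemoryRecipeSketch {
  std::vector<VPValueSketch *> Operands;

  MemoryRecipeSketch(VPValueSketch *Addr, VPValueSketch *Mask) {
    Operands.push_back(Addr); // Address is the 1st, mandatory operand.
    if (Mask)
      Operands.push_back(Mask); // Mask is optional and stored last.
  }

  VPValueSketch *getAddr() const { return Operands[0]; }

  // A full (all-ones) mask is represented by a nullptr.
  VPValueSketch *getMask() const {
    return Operands.size() == 2 ? Operands[1] : nullptr;
  }
};

int main() {
  VPValueSketch Addr, Mask;
  MemoryRecipeSketch Unmasked(&Addr, nullptr), Masked(&Addr, &Mask);
  assert(Unmasked.getAddr() == &Addr && Unmasked.getMask() == nullptr);
  assert(Masked.getAddr() == &Addr && Masked.getMask() == &Mask);
  return 0;
}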