author    | Dimitry Andric <dim@FreeBSD.org> | 2020-01-17 20:45:01 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-01-17 20:45:01 +0000
commit    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree      | 4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Transforms/Vectorize
parent    | 7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
download  | src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.tar.gz, src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.zip
Vendor import of llvm-project master e26a78e70, the last commit before
the llvmorg-11-init tag, from which release/10.x was branched.
Notes:
svn path=/vendor/llvm-project/master/; revision=356843
svn path=/vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085/; revision=356844; tag=vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp       |    5
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp |   14
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h    |   11
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 1011
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             |  965
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h             |   44
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp                     |   61
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h                       |  365
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (renamed from llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp) | 14
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.h (renamed from llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h)     | 12
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h                  |    2
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp             |    1
12 files changed, 1548 insertions, 957 deletions
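
One recurring pattern in the LoopVectorize.cpp changes below is the promotion of the
hard-coded TinyTripCountInterleaveThreshold constant to a cl::opt command-line knob.
As a standalone sketch of that LLVM cl::opt pattern (the option name and default match
the diff; the main() driver is only an illustrative harness, not part of the change):

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Formerly: static const unsigned TinyTripCountInterleaveThreshold = 128;
// As a cl::opt the default stays 128, but the value becomes tunable via
// -tiny-trip-count-interleave-threshold=<N>; cl::Hidden keeps the option
// out of the standard -help listing.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip "
             "count below this number"));

// Illustrative driver only.
int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // cl::opt<unsigned> converts implicitly to its underlying unsigned value.
  outs() << "interleave threshold = " << TinyTripCountInterleaveThreshold << "\n";
  return 0;
}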
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index f44976c723ec..7478daa2a0a5 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -38,6 +38,7 @@
 // could use this pass (with some modifications), but currently it implements
 // its own pass to do something similar to what we do here.
 
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/MapVector.h"
@@ -52,7 +53,6 @@
 #include "llvm/Analysis/OrderedBasicBlock.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Attributes.h"
@@ -71,14 +71,15 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f43842be5357..3f943f4c0688 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -72,7 +72,7 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
       Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
       Force("vectorize.enable", FK_Undefined, HK_FORCE),
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
-      Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L),
+      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L),
       ORE(ORE) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
@@ -815,6 +815,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
     }
   }
 
+  // For first order recurrences, we use the previous value (incoming value from
+  // the latch) to check if it dominates all users of the recurrence. Bail out
+  // if we have to sink such an instruction for another recurrence, as the
+  // dominance requirement may not hold after sinking.
+  BasicBlock *LoopLatch = TheLoop->getLoopLatch();
+  if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
+        Instruction *V =
+            cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
+        return SinkAfter.find(V) != SinkAfter.end();
+      }))
+    return false;
+
   // Now we know the widest induction type, check if our found induction
   // is the same size. If it's not, unset it here and InnerLoopVectorizer
   // will create another.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a5e85f27fabf..c3ca43fcd492 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -201,6 +201,9 @@ class LoopVectorizationPlanner {
   /// The profitability analysis.
   LoopVectorizationCostModel &CM;
 
+  /// The interleaved access analysis.
+  InterleavedAccessInfo &IAI;
+
   SmallVector<VPlanPtr, 4> VPlans;
 
   /// This class is used to enable the VPlan to invoke a method of ILV. This is
@@ -211,6 +214,8 @@ class LoopVectorizationPlanner {
     VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
 
     Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+    Value *getOrCreateScalarValue(Value *V,
+                                  const VPIteration &Instance) override;
   };
 
   /// A builder used to construct the current plan.
@@ -223,8 +228,10 @@ public:
   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo *TTI,
                            LoopVectorizationLegality *Legal,
-                           LoopVectorizationCostModel &CM)
-      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
+                           LoopVectorizationCostModel &CM,
+                           InterleavedAccessInfo &IAI)
+      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+        IAI(IAI) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8f0bf70f873c..684a3098e564 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,8 +58,8 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanHCFGBuilder.h"
-#include "VPlanHCFGTransforms.h"
 #include "VPlanPredicator.h"
+#include "VPlanTransforms.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -124,6 +124,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -149,7 +150,6 @@
 #include <string>
 #include <tuple>
 #include <utility>
-#include <vector>
 
 using namespace llvm;
 
@@ -200,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
 
-/// We don't interleave loops with a known constant trip count below this
-/// number.
-static const unsigned TinyTripCountInterleaveThreshold = 128;
+static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
+    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
+    cl::desc("We don't interleave loops with a estimated constant trip count "
+             "below this number"));
 
 static cl::opt<unsigned> ForceTargetNumScalarRegs(
     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
@@ -427,6 +428,11 @@ public:
   /// new unrolled loop, where UF is the unroll factor.
   using VectorParts = SmallVector<Value *, 2>;
 
+  /// Vectorize a single GetElementPtrInst based on information gathered and
+  /// decisions taken during planning.
+  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
+                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -476,15 +482,20 @@ public:
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to,
-  /// optionally masking the vector operations if \p BlockInMask is non-null.
-  void vectorizeInterleaveGroup(Instruction *Instr,
-                                VectorParts *BlockInMask = nullptr);
-
-  /// Vectorize Load and Store instructions, optionally masking the vector
-  /// operations if \p BlockInMask is non-null.
-  void vectorizeMemoryInstruction(Instruction *Instr,
-                                  VectorParts *BlockInMask = nullptr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to
+  /// with the base address given in \p Addr, optionally masking the vector
+  /// operations if \p BlockInMask is non-null. Use \p State to translate given
+  /// VPValues to IR values in the vectorized loop.
+  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
+                                VPValue *Addr, VPValue *BlockInMask = nullptr);
+
+  /// Vectorize Load and Store instructions with the base address given in \p
+  /// Addr, optionally masking the vector operations if \p BlockInMask is
+  /// non-null. Use \p State to translate given VPValues to IR values in the
+  /// vectorized loop.
+  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
+                                  VPValue *Addr,
+                                  VPValue *BlockInMask = nullptr);
 
   /// Set the debug location in the builder using the debug location in
   /// the instruction.
@@ -525,6 +536,9 @@ protected:
   /// vectorizing this phi node.
   void fixReduction(PHINode *Phi);
 
+  /// Clear NSW/NUW flags from reduction instructions if necessary.
+  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+
   /// The Loop exit block may have single value PHI nodes with some
   /// incoming value. While vectorizing we only handled real values
   /// that were defined inside the loop and we should have one value for
@@ -539,10 +553,6 @@ protected:
   /// represented as.
   void truncateToMinimalBitwidths();
 
-  /// Insert the new loop to the loop hierarchy and pass manager
-  /// and update the analysis passes.
-  void updateAnalysis();
-
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
   /// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -1204,14 +1214,14 @@ public:
   /// Returns true if the target machine supports masked scatter operation
   /// for the given \p DataType.
-  bool isLegalMaskedScatter(Type *DataType) {
-    return TTI.isLegalMaskedScatter(DataType);
+  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+    return TTI.isLegalMaskedScatter(DataType, Alignment);
   }
 
   /// Returns true if the target machine supports masked gather operation
   /// for the given \p DataType.
-  bool isLegalMaskedGather(Type *DataType) {
-    return TTI.isLegalMaskedGather(DataType);
+  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
+    return TTI.isLegalMaskedGather(DataType, Alignment);
   }
 
   /// Returns true if the target machine can represent \p V as a masked gather
@@ -1222,7 +1232,9 @@ public:
     if (!LI && !SI)
       return false;
     auto *Ty = getMemInstValueType(V);
-    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+    MaybeAlign Align = getLoadStoreAlignment(V);
+    return (LI && isLegalMaskedGather(Ty, Align)) ||
+           (SI && isLegalMaskedScatter(Ty, Align));
   }
 
   /// Returns true if \p I is an instruction that will be scalarized with
@@ -2155,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
-                                                   VectorParts *BlockInMask) {
+                                                   VPTransformState &State,
+                                                   VPValue *Addr,
+                                                   VPValue *BlockInMask) {
   const InterleaveGroup<Instruction> *Group =
       Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
@@ -2165,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     return;
 
   const DataLayout &DL = Instr->getModule()->getDataLayout();
-  Value *Ptr = getLoadStorePointerOperand(Instr);
 
   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getMemInstValueType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
-  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
 
   // Prepare for the new pointers.
-  setDebugLocFromInst(Builder, Ptr);
-  SmallVector<Value *, 2> NewPtrs;
+  SmallVector<Value *, 2> AddrParts;
   unsigned Index = Group->getIndex(Instr);
 
-  VectorParts Mask;
-  bool IsMaskForCondRequired = BlockInMask;
-  if (IsMaskForCondRequired) {
-    Mask = *BlockInMask;
-    // TODO: extend the masked interleaved-group support to reversed access.
-    assert(!Group->isReverse() && "Reversed masked interleave-group "
-                                  "not supported.");
-  }
+  // TODO: extend the masked interleaved-group support to reversed access.
+  assert((!BlockInMask || !Group->isReverse()) &&
+         "Reversed masked interleave-group not supported.");
 
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
@@ -2196,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   if (Group->isReverse())
     Index += (VF - 1) * Group->getFactor();
 
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-    InBounds = gep->isInBounds();
-
   for (unsigned Part = 0; Part < UF; Part++) {
-    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
+    Value *AddrPart = State.get(Addr, {Part, 0});
+    setDebugLocFromInst(Builder, AddrPart);
 
     // Notice current instruction could be any index. Need to adjust the address
     // to the member of index 0.
@@ -2214,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     //       A[i]   = b;  // Member of index 0
     //       A[i+2] = c;  // Member of index 2 (Current instruction)
     // Current pointer is pointed to A[i+2], adjust it to A[i].
-    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
-    if (InBounds)
-      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
+
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
+    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
 
     // Cast to the vector pointer type.
-    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
+    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
+    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
+    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
   }
 
   setDebugLocFromInst(Builder, Instr);
@@ -2237,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
       Instruction *NewLoad;
-      if (IsMaskForCondRequired || MaskForGaps) {
+      if (BlockInMask || MaskForGaps) {
         assert(useMaskedInterleavedAccesses(*TTI) &&
                "masked interleaved groups are not allowed.");
         Value *GroupMask = MaskForGaps;
-        if (IsMaskForCondRequired) {
-          auto *Undefs = UndefValue::get(Mask[Part]->getType());
+        if (BlockInMask) {
+          Value *BlockInMaskPart = State.get(BlockInMask, Part);
+          auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
           Value *ShuffledMask = Builder.CreateShuffleVector(
-              Mask[Part], Undefs, RepMask, "interleaved.mask");
+              BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
           GroupMask = MaskForGaps
                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                 MaskForGaps)
                           : ShuffledMask;
         }
         NewLoad =
-            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
                                      GroupMask, UndefVec, "wide.masked.vec");
       }
       else
-        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
+        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                             Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
@@ -2325,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                       "interleaved.vec");
 
     Instruction *NewStoreInstr;
-    if (IsMaskForCondRequired) {
-      auto *Undefs = UndefValue::get(Mask[Part]->getType());
+    if (BlockInMask) {
+      Value *BlockInMaskPart = State.get(BlockInMask, Part);
+      auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
       Value *ShuffledMask = Builder.CreateShuffleVector(
-          Mask[Part], Undefs, RepMask, "interleaved.mask");
+          BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
       NewStoreInstr = Builder.CreateMaskedStore(
-          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+          IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
     }
     else
-      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
-                                                 Group->getAlignment());
+      NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
+                                                 Group->getAlignment());
 
     Group->addMetadata(NewStoreInstr);
   }
 }
 
 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
-                                                     VectorParts *BlockInMask) {
+                                                     VPTransformState &State,
+                                                     VPValue *Addr,
+                                                     VPValue *BlockInMask) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   StoreInst *SI = dyn_cast<StoreInst>(Instr);
 
@@ -2354,17 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
          "CM decision should be taken at this point");
   if (Decision == LoopVectorizationCostModel::CM_Interleave)
-    return vectorizeInterleaveGroup(Instr);
+    return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
-  Value *Ptr = getLoadStorePointerOperand(Instr);
   // An alignment of 0 means target abi alignment. We need to use the scalar's
   // target abi alignment in such a case.
   const DataLayout &DL = Instr->getModule()->getDataLayout();
   const Align Alignment =
       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
-  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -2378,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   // gather/scatter. Otherwise Decision should have been to Scalarize.
   assert((ConsecutiveStride || CreateGatherScatter) &&
          "The instruction should be scalarized");
+  (void)ConsecutiveStride;
 
-  // Handle consecutive loads/stores.
-  if (ConsecutiveStride)
-    Ptr = getOrCreateScalarValue(Ptr, {0, 0});
-
-  VectorParts Mask;
+  VectorParts BlockInMaskParts(UF);
   bool isMaskRequired = BlockInMask;
   if (isMaskRequired)
-    Mask = *BlockInMask;
-
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(
-          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
-    InBounds = gep->isInBounds();
+    for (unsigned Part = 0; Part < UF; ++Part)
+      BlockInMaskParts[Part] = State.get(BlockInMask, Part);
 
   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
     // Calculate the pointer for the specific unroll-part.
     GetElementPtrInst *PartPtr = nullptr;
 
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+
     if (Reverse) {
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
@@ -2407,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
       PartPtr->setIsInBounds(InBounds);
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
-        Mask[Part] = reverseVector(Mask[Part]);
+        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
       PartPtr = cast<GetElementPtrInst>(
           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
       PartPtr->setIsInBounds(InBounds);
     }
 
+    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
   };
 
@@ -2425,8 +2433,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
       Instruction *NewSI = nullptr;
       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
       if (CreateGatherScatter) {
-        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
-        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        Value *VectorGep = State.get(Addr, Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
                                             Alignment.value(), MaskPart);
       } else {
@@ -2437,10 +2445,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
           // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
         }
-        auto *VecPtr = CreateVecPtr(Part, Ptr);
+        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
         if (isMaskRequired)
-          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
-                                            Alignment.value(), Mask[Part]);
+          NewSI = Builder.CreateMaskedStore(
+              StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
         else
           NewSI =
               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
@@ -2456,17 +2464,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *NewLI;
     if (CreateGatherScatter) {
-      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
-      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      Value *VectorGep = State.get(Addr, Part);
       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
                                          nullptr, "wide.masked.gather");
       addMetadata(NewLI, LI);
     } else {
-      auto *VecPtr = CreateVecPtr(Part, Ptr);
+      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
       if (isMaskRequired)
-        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
-                                         UndefValue::get(DataTy),
-                                         "wide.masked.load");
+        NewLI = Builder.CreateMaskedLoad(
+            VecPtr, Alignment.value(), BlockInMaskParts[Part],
+            UndefValue::get(DataTy), "wide.masked.load");
       else
         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
                                           "wide.load");
@@ -2676,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                          BasicBlock *Bypass) {
   Value *Count = getOrCreateTripCount(L);
-  BasicBlock *BB = L->getLoopPreheader();
-  IRBuilder<> Builder(BB->getTerminator());
+  // Reuse existing vector loop preheader for TC checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  IRBuilder<> Builder(TCCheckBlock->getTerminator());
 
   // Generate code to check if the loop's trip count is less than VF * UF, or
   // equal to it in case a scalar epilogue is required; this implies that the
@@ -2694,48 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
       P, Count, ConstantInt::get(Count->getType(), VF * UF),
       "min.iters.check");
 
-  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
-  LoopBypassBlocks.push_back(BB);
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader =
+      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
+                 "vector.ph");
+
+  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+                               DT->getNode(Bypass)->getIDom()) &&
+         "TC check is expected to dominate Bypass");
+
+  // Update dominator for Bypass & LoopExit.
+  DT->changeImmediateDominator(Bypass, TCCheckBlock);
+  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+  ReplaceInstWithInst(
+      TCCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+  LoopBypassBlocks.push_back(TCCheckBlock);
 }
 
 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-  BasicBlock *BB = L->getLoopPreheader();
+  // Reuse existing vector loop preheader for SCEV checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
 
   // Generate the code to check that the SCEV assumptions that we made.
   // We want the new basic block to start at the first instruction in a
   // sequence of instructions that form a check.
   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                    "scev.check");
-  Value *SCEVCheck =
-      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+  Value *SCEVCheck = Exp.expandCodeForPredicate(
+      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
 
   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
     if (C->isZero())
       return;
 
-  assert(!BB->getParent()->hasOptSize() &&
+  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
          "Cannot SCEV check stride or overflow when optimizing for size");
 
-  // Create a new block containing the stride check.
-  BB->setName("vector.scevcheck");
-  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
-  LoopBypassBlocks.push_back(BB);
+  SCEVCheckBlock->setName("vector.scevcheck");
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader =
+      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
+                 nullptr, "vector.ph");
+
+  // Update dominator only if this is first RT check.
+  if (LoopBypassBlocks.empty()) {
+    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
+    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
+  }
+
+  ReplaceInstWithInst(
+      SCEVCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+  LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 }
 
@@ -2744,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   if (EnableVPlanNativePath)
     return;
 
-  BasicBlock *BB = L->getLoopPreheader();
+  // Reuse existing vector loop preheader for runtime memory checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const MemCheckBlock = L->getLoopPreheader();
 
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
   Instruction *FirstCheckInst;
   Instruction *MemRuntimeCheck;
   std::tie(FirstCheckInst, MemRuntimeCheck) =
-      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+      Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
   if (!MemRuntimeCheck)
     return;
 
-  if (BB->getParent()->hasOptSize()) {
+  if (MemCheckBlock->getParent()->hasOptSize()) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
            "Cannot emit memory checks when optimizing for size, unless forced "
            "to vectorize.");
@@ -2770,24 +2795,28 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
     });
   }
 
-  // Create a new block containing the memory check.
-  BB->setName("vector.memcheck");
-  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
-  LoopBypassBlocks.push_back(BB);
+  MemCheckBlock->setName("vector.memcheck");
+  // Create new preheader for vector loop.
+  LoopVectorPreHeader =
+      SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
+                 "vector.ph");
+
+  // Update dominator only if this is first RT check.
+  if (LoopBypassBlocks.empty()) {
+    DT->changeImmediateDominator(Bypass, MemCheckBlock);
+    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
+  }
+
+  ReplaceInstWithInst(
+      MemCheckBlock->getTerminator(),
+      BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
+  LoopBypassBlocks.push_back(MemCheckBlock);
   AddedSafetyChecks = true;
 
   // We currently don't use LoopVersioning for the actual loop cloning but we
   // still use it to add the noalias metadata.
   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
-                                          PSE.getSE());
+                                          PSE.getSE());
   LVer->prepareNoAliasMetadata();
 }
 
@@ -2912,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   ...
   */
 
-  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
-  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   MDNode *OrigLoopID = OrigLoop->getLoopID();
-  assert(VectorPH && "Invalid loop structure");
-  assert(ExitBlock && "Must have an exit block");
 
   // Some loops have a single integer induction variable, while other loops
   // don't. One example is c++ iterators that often have multiple pointer
@@ -2934,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   Type *IdxTy = Legal->getWidestInductionType();
 
   // Split the single block loop into the two loop structure described above.
-  BasicBlock *VecBody =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
-  BasicBlock *MiddleBlock =
-      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
-  BasicBlock *ScalarPH =
-      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+  LoopScalarBody = OrigLoop->getHeader();
+  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+  LoopExitBlock = OrigLoop->getExitBlock();
+  assert(LoopExitBlock && "Must have an exit block");
+  assert(LoopVectorPreHeader && "Invalid loop structure");
+
+  LoopMiddleBlock =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 LI, nullptr, "middle.block");
+  LoopScalarPreHeader =
+      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+                 nullptr, "scalar.ph");
+  // We intentionally don't let SplitBlock to update LoopInfo since
+  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
+  // LoopVectorBody is explicitly added to the correct place few lines later.
+  LoopVectorBody =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 nullptr, nullptr, "vector.body");
+
+  // Update dominator for loop exit.
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
 
   // Create and register the new vector loop.
   Loop *Lp = LI->AllocateLoop();
@@ -2949,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // before calling any utilities such as SCEV that require valid LoopInfo.
   if (ParentLoop) {
     ParentLoop->addChildLoop(Lp);
-    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
   } else {
     LI->addTopLevelLoop(Lp);
   }
-  Lp->addBasicBlockToLoop(VecBody, *LI);
+  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
 
   // Find the loop boundaries.
   Value *Count = getOrCreateTripCount(Lp);
@@ -2966,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // backedge-taken count is uint##_max: adding one to it will overflow leading
   // to an incorrect trip count of zero. In this (rare) case we will also jump
   // to the scalar loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
+  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
 
   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
-  emitSCEVChecks(Lp, ScalarPH);
+  emitSCEVChecks(Lp, LoopScalarPreHeader);
 
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, ScalarPH);
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
 
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
@@ -3003,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     InductionDescriptor II = InductionEntry.second;
 
     // Create phi nodes to merge from the backedge-taken check block.
-    PHINode *BCResumeVal = PHINode::Create(
-        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+    PHINode *BCResumeVal =
+        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+                        LoopScalarPreHeader->getTerminator());
     // Copy original phi DL over to the new one.
     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
     Value *&EndValue = IVEndValues[OrigPhi];
@@ -3015,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
       Type *StepType = II.getStep()->getType();
       Instruction::CastOps CastOp =
-        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
-      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
       EndValue->setName("ind.end");
     }
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    BCResumeVal->addIncoming(EndValue, MiddleBlock);
+    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
 
     // Fix the scalar body counter (PHI node).
     // The old induction's phi node in the scalar body needs the truncated
    // value.
     for (BasicBlock *BB : LoopBypassBlocks)
       BCResumeVal->addIncoming(II.getStartValue(), BB);
-    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
+    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
   }
 
   // We need the OrigLoop (scalar loop part) latch terminator to help
@@ -3049,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // If tail is to be folded, we know we don't need to run the remainder.
   Value *CmpN = Builder.getTrue();
   if (!Cost->foldTailByMasking()) {
-    CmpN =
-        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+    CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                           CountRoundDown, "cmp.n",
+                           LoopMiddleBlock->getTerminator());
 
     // Here we use the same DebugLoc as the scalar loop latch branch instead
     // of the corresponding compare because they may have ended up with
@@ -3060,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
   }
 
-  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
+  BranchInst *BrInst =
+      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
-  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
+  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
 
   // Get ready to start creating new instructions into the vectorized body.
-  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
-
-  // Save the state.
-  LoopVectorPreHeader = Lp->getLoopPreheader();
-  LoopScalarPreHeader = ScalarPH;
-  LoopMiddleBlock = MiddleBlock;
-  LoopExitBlock = ExitBlock;
-  LoopVectorBody = VecBody;
-  LoopScalarBody = OldBasicBlock;
+  assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
+         "Inconsistent vector loop preheader");
+  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
 
   Optional<MDNode *> VectorizedLoopID =
       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
@@ -3094,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     LoopVectorizeHints Hints(Lp, true, *ORE);
     Hints.setAlreadyVectorized();
 
+#ifdef EXPENSIVE_CHECKS
+  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+  LI->verify(*DT);
+#endif
+
   return LoopVectorPreHeader;
 }
 
@@ -3429,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
   // This is the second stage of vectorizing recurrences.
   fixCrossIterationPHIs();
 
-  // Update the dominator tree.
-  //
-  // FIXME: After creating the structure of the new loop, the dominator tree is
-  //        no longer up-to-date, and it remains that way until we update it
-  //        here. An out-of-date dominator tree is problematic for SCEV,
-  //        because SCEVExpander uses it to guide code generation. The
-  //        vectorizer use SCEVExpanders in several places. Instead, we should
-  //        keep the dominator tree up-to-date as we go.
-  updateAnalysis();
+  // Forget the original basic block.
+  PSE.getSE()->forgetLoop(OrigLoop);
 
   // Fix-up external users of the induction variables.
   for (auto &Entry : *Legal->getInductionVars())
@@ -3550,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
   // among all unrolled iterations, due to the order of their construction.
   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
 
-  // Set the insertion point after the previous value if it is an instruction.
+  // Find and set the insertion point after the previous value if it is an
+  // instruction.
+  BasicBlock::iterator InsertPt;
   // Note that the previous value may have been constant-folded so it is not
-  // guaranteed to be an instruction in the vector loop. Also, if the previous
-  // value is a phi node, we should insert after all the phi nodes to avoid
-  // breaking basic block verification.
-  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
-      isa<PHINode>(PreviousLastPart))
-    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
-  else
-    Builder.SetInsertPoint(
-        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
+  // guaranteed to be an instruction in the vector loop.
+  // FIXME: Loop invariant values do not form recurrences. We should deal with
+  //        them earlier.
+  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
+    InsertPt = LoopVectorBody->getFirstInsertionPt();
+  else {
+    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
+    if (isa<PHINode>(PreviousLastPart))
+      // If the previous value is a phi node, we should insert after all the phi
+      // nodes in the block containing the PHI to avoid breaking basic block
+      // verification. Note that the basic block may be different to
+      // LoopVectorBody, in case we predicate the loop.
+      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
+    else
+      InsertPt = ++PreviousInst->getIterator();
+  }
+  Builder.SetInsertPoint(&*InsertPt);
 
   // We will construct a vector for the recurrence by combining the values for
   // the current and previous iterations. This is the required shuffle mask.
@@ -3693,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
     }
   }
 
+  // Wrap flags are in general invalid after vectorization, clear them.
+  clearReductionWrapFlags(RdxDesc);
+
   // Fix the vector-loop phi.
 
   // Reductions do not have to start at zero. They can start with
   // any loop invariant values.
   BasicBlock *Latch = OrigLoop->getLoopLatch();
   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
     Value *Val = getOrCreateVectorValue(LoopVal, Part);
-    // Make sure to add the reduction stat value only to the
+    // Make sure to add the reduction start value only to the
    // first unroll part.
     Value *StartVal = (Part == 0) ? VectorStart : Identity;
     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
@@ -3839,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
 }
 
+void InnerLoopVectorizer::clearReductionWrapFlags(
+    RecurrenceDescriptor &RdxDesc) {
+  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+  if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
+      RK != RecurrenceDescriptor::RK_IntegerMult)
+    return;
+
+  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
+  assert(LoopExitInstr && "null loop exit instruction");
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  Worklist.push_back(LoopExitInstr);
+  Visited.insert(LoopExitInstr);
+
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.pop_back_val();
+    if (isa<OverflowingBinaryOperator>(Cur))
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *V = getOrCreateVectorValue(Cur, Part);
+        cast<Instruction>(V)->dropPoisonGeneratingFlags();
+      }
+
+    for (User *U : Cur->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
+          Visited.insert(UI).second)
+        Worklist.push_back(UI);
+    }
+  }
+}
+
 void InnerLoopVectorizer::fixLCSSAPHIs() {
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
@@ -3960,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
   }
 }
 
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
+                                   unsigned VF, bool IsPtrLoopInvariant,
+                                   SmallBitVector &IsIndexLoopInvariant) {
+  // Construct a vector GEP by widening the operands of the scalar GEP as
+  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+  // results in a vector of pointers when at least one operand of the GEP
+  // is vector-typed. Thus, to keep the representation compact, we only use
+  // vector-typed operands for loop-varying values.
+
+  if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+    // If we are vectorizing, but the GEP has only loop-invariant operands,
+    // the GEP we build (by only using vector-typed operands for
+    // loop-varying values) would be a scalar pointer. Thus, to ensure we
+    // produce a vector of pointers, we need to either arbitrarily pick an
+    // operand to broadcast, or broadcast a clone of the original GEP.
+    // Here, we broadcast a clone of the original.
+    //
+    // TODO: If at some point we decide to scalarize instructions having
+    //       loop-invariant operands, this special case will no longer be
+    //       required. We would add the scalarization decision to
+    //       collectLoopScalars() and teach getVectorValue() to broadcast
+    //       the lane-zero scalar value.
+    auto *Clone = Builder.Insert(GEP->clone());
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+      addMetadata(EntryPart, GEP);
+    }
+  } else {
+    // If the GEP has at least one loop-varying operand, we are sure to
+    // produce a vector of pointers. But if we are only unrolling, we want
+    // to produce a scalar GEP for each unroll part. Thus, the GEP we
+    // produce with the code below will be scalar (if VF == 1) or vector
+    // (otherwise). Note that for the unroll-only case, we still maintain
+    // values in the vector mapping with initVector, as we do for other
+    // instructions.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      // The pointer operand of the new GEP. If it's loop-invariant, we
+      // won't broadcast it.
+      auto *Ptr = IsPtrLoopInvariant
+                      ? GEP->getPointerOperand()
+                      : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+      // Collect all the indices for the new GEP. If any index is
+      // loop-invariant, we won't broadcast it.
+      SmallVector<Value *, 4> Indices;
+      for (auto Index : enumerate(GEP->indices())) {
+        Value *User = Index.value().get();
+        if (IsIndexLoopInvariant[Index.index()])
+          Indices.push_back(User);
+        else
+          Indices.push_back(getOrCreateVectorValue(User, Part));
+      }
+
+      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+      // but it should be a vector, otherwise.
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+                                          Indices)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+             "NewGEP is not a pointer vector");
+      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+      addMetadata(NewGEP, GEP);
+    }
+  }
+}
+
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
   PHINode *P = cast<PHINode>(PN);
@@ -4062,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
   case Instruction::Br:
   case Instruction::PHI:
+  case Instruction::GetElementPtr:
     llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::GetElementPtr: {
-    // Construct a vector GEP by widening the operands of the scalar GEP as
-    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-    // results in a vector of pointers when at least one operand of the GEP
-    // is vector-typed. Thus, to keep the representation compact, we only use
-    // vector-typed operands for loop-varying values.
-    auto *GEP = cast<GetElementPtrInst>(&I);
-
-    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
-      // If we are vectorizing, but the GEP has only loop-invariant operands,
-      // the GEP we build (by only using vector-typed operands for
-      // loop-varying values) would be a scalar pointer. Thus, to ensure we
-      // produce a vector of pointers, we need to either arbitrarily pick an
-      // operand to broadcast, or broadcast a clone of the original GEP.
-      // Here, we broadcast a clone of the original.
-      //
-      // TODO: If at some point we decide to scalarize instructions having
-      //       loop-invariant operands, this special case will no longer be
-      //       required. We would add the scalarization decision to
-      //       collectLoopScalars() and teach getVectorValue() to broadcast
-      //       the lane-zero scalar value.
-      auto *Clone = Builder.Insert(GEP->clone());
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
-        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
-        addMetadata(EntryPart, GEP);
-      }
-    } else {
-      // If the GEP has at least one loop-varying operand, we are sure to
-      // produce a vector of pointers. But if we are only unrolling, we want
-      // to produce a scalar GEP for each unroll part. Thus, the GEP we
-      // produce with the code below will be scalar (if VF == 1) or vector
-      // (otherwise). Note that for the unroll-only case, we still maintain
-      // values in the vector mapping with initVector, as we do for other
-      // instructions.
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        // The pointer operand of the new GEP. If it's loop-invariant, we
-        // won't broadcast it.
-        auto *Ptr =
-            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
-                ? GEP->getPointerOperand()
-                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
-
-        // Collect all the indices for the new GEP. If any index is
-        // loop-invariant, we won't broadcast it.
-        SmallVector<Value *, 4> Indices;
-        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
-          if (OrigLoop->isLoopInvariant(U.get()))
-            Indices.push_back(U.get());
-          else
-            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
-        }
-
-        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-        // but it should be a vector, otherwise.
-        auto *NewGEP =
-            GEP->isInBounds()
-                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
-                                            Indices)
-                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
-        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
-               "NewGEP is not a pointer vector");
-        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
-        addMetadata(NewGEP, GEP);
-      }
-    }
-
-    break;
-  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
@@ -4335,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   } // end of switch.
 }
 
-void InnerLoopVectorizer::updateAnalysis() {
-  // Forget the original basic block.
-  PSE.getSE()->forgetLoop(OrigLoop);
-
-  // DT is not kept up-to-date for outer loop vectorization
-  if (EnableVPlanNativePath)
-    return;
-
-  // Update the dominator tree information.
-  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
-         "Entry does not dominate exit.");
-
-  DT->addNewBlock(LoopMiddleBlock,
-                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
-  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
-  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
-  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   // We should not collect Scalars more than once per VF. Right now, this
   // function is called from collectUniformsAndScalars(), which already does
@@ -4562,9 +4619,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
       return WideningDecision == CM_Scalarize;
     }
     const MaybeAlign Alignment = getLoadStoreAlignment(I);
-    return isa<LoadInst>(I) ?
-        !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
-      : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
+    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
+                                isLegalMaskedGather(Ty, Alignment))
+                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
+                                isLegalMaskedScatter(Ty, Alignment));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
@@ -4667,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
   SetVector<Instruction *> Worklist;
   BasicBlock *Latch = TheLoop->getLoopLatch();
 
+  // Instructions that are scalar with predication must not be considered
+  // uniform after vectorization, because that would create an erroneous
+  // replicating region where only a single instance out of VF should be formed.
+  // TODO: optimize such seldom cases if found important, see PR40816.
+  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+    if (isScalarWithPredication(I, VF)) {
+      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
+      return;
+    }
+    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+    Worklist.insert(I);
+  };
+
   // Start with the conditional branch. If the branch condition is an
   // instruction contained in the loop that is only used by the branch, it is
   // uniform.
   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
-  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
-    Worklist.insert(Cmp);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
-  }
+  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    addToWorklistIfAllowed(Cmp);
 
   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
   // are pointers that are treated like consecutive pointers during
@@ -4733,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
   // Add to the Worklist all consecutive and consecutive-like pointers that
   // aren't also identified as possibly non-uniform.
   for (auto *V : ConsecutiveLikePtrs)
-    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
-      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
-      Worklist.insert(V);
-    }
+    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+      addToWorklistIfAllowed(V);
 
   // Expand Worklist in topological order: whenever a new instruction
   // is added , its users should be already inside Worklist. It ensures
@@ -4762,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
             return Worklist.count(J) ||
                    (OI == getLoadStorePointerOperand(J) &&
                     isUniformDecision(J, VF));
-          })) {
-        Worklist.insert(OI);
-        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
-      }
+          }))
+        addToWorklistIfAllowed(OI);
     }
   }
 
@@ -4807,11 +4873,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
       continue;
 
     // The induction variable and its update instruction will remain uniform.
-    Worklist.insert(Ind);
-    Worklist.insert(IndUpdate);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
-                      << "\n");
+    addToWorklistIfAllowed(Ind);
+    addToWorklistIfAllowed(IndUpdate);
  }
 
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
@@ -5143,9 +5206,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small trip count.
-  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count.
+  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5208,12 +5272,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
     MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  // If the trip count is constant, limit the interleave count to be less than
-  // the trip count divided by VF.
-  if (TC > 0) {
-    assert(TC >= VF && "VF exceeds trip count?");
-    if ((TC / VF) < MaxInterleaveCount)
-      MaxInterleaveCount = (TC / VF);
+  // If trip count is known or estimated compile time constant, limit the
+  // interleave count to be less than the trip count divided by VF.
+  if (BestKnownTC) {
+    MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
   }
 
   // If we did not calculate the cost for VF (because the user selected the VF)
@@ -5746,7 +5808,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // vectorized loop where the user of it is a vectorized instruction.
   const MaybeAlign Alignment = getLoadStoreAlignment(I);
   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                   Alignment ? Alignment->value() : 0, AS);
+                                   Alignment, AS);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5783,8 +5845,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
                                       Alignment ? Alignment->value() : 0, AS);
   else
-    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
-                                Alignment ? Alignment->value() : 0, AS, I);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
 
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
@@ -5800,16 +5861,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   unsigned AS = getLoadStoreAddressSpace(I);
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(Instruction::Load, ValTy,
-                               Alignment ? Alignment->value() : 0, AS) +
+           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
   }
   StoreInst *SI = cast<StoreInst>(I);
 
   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
   return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy,
-                             Alignment ? Alignment->value() : 0, AS) +
+         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
          (isLoopInvariantStoreValue
               ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
@@ -5877,8 +5936,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
     unsigned AS = getLoadStoreAddressSpace(I);
 
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy,
-                               Alignment ? Alignment->value() : 0, AS, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
   }
   return getWideningCost(I, VF);
 }
@@ -6217,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
     return N * TTI.getArithmeticInstrCost(
                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
-                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
+                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
-                   I->getOperand(0));
+                   I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
@@ -6714,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
   return BlockMaskCache[BB] = BlockMask;
 }
 
-VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
-                                                           VFRange &Range,
-                                                           VPlanPtr &Plan) {
-  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
-  if (!IG)
-    return nullptr;
-
-  // Now check if IG is relevant for VF's in the given range.
-  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
-    return [=](unsigned VF) -> bool {
-      return (VF >= 2 && // Query is illegal for VF == 1
-              CM.getWideningDecision(I, VF) ==
-                  LoopVectorizationCostModel::CM_Interleave);
-    };
-  };
-  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
-    return nullptr;
-
-  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
-  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
-  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
-  assert(I == IG->getInsertPos() &&
-         "Generating a recipe for an adjunct member of an interleave group");
-
-  VPValue *Mask = nullptr;
-  if (Legal->isMaskRequired(I))
-    Mask = createBlockInMask(I->getParent(), Plan);
-
-  return new VPInterleaveRecipe(IG, Mask);
-}
-
 VPWidenMemoryInstructionRecipe *
 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                   VPlanPtr &Plan) {
@@ -6754,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
   auto willWiden = [&](unsigned VF) -> bool {
     if (VF == 1)
       return false;
-    if (CM.isScalarAfterVectorization(I, VF) ||
-        CM.isProfitableToScalarize(I, VF))
-      return false;
     LoopVectorizationCostModel::InstWidening Decision =
         CM.getWideningDecision(I, VF);
     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
-    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
-           "Interleave memory opportunity should be caught earlier.");
+    if (Decision == LoopVectorizationCostModel::CM_Interleave)
+      return true;
+    if (CM.isScalarAfterVectorization(I, VF) ||
+        CM.isProfitableToScalarize(I, VF))
+      return false;
     return Decision != LoopVectorizationCostModel::CM_Scalarize;
   };
 
@@ -6773,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
   if (Legal->isMaskRequired(I))
     Mask = createBlockInMask(I->getParent(), Plan);
 
-  return new VPWidenMemoryInstructionRecipe(*I, Mask);
+  VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
+  return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
 }
 
 VPWidenIntOrFpInductionRecipe *
@@ -6861,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
   case Instruction::FPTrunc:
   case Instruction::FRem:
   case Instruction::FSub:
-  case Instruction::GetElementPtr:
   case Instruction::ICmp:
   case Instruction::IntToPtr:
   case Instruction::Load:
@@ -6926,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. - if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6951,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -7009,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. 
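// For illustration: the GEP handling just below, like the other recipe
// checks, funnels its per-VF decision through
// LoopVectorizationPlanner::getDecisionAndClampRange. A simplified,
// stand-alone sketch of that helper follows (VFRange stands in for the
// real struct; this is a model under assumed types, not the exact LLVM
// declaration):

#include <functional>

struct VFRange { unsigned Start, End; }; // candidate VFs in [Start, End)

static bool getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  // Take the decision at Range.Start, then clamp Range.End at the first
  // power-of-2 VF whose decision diverges, so a single recipe can serve
  // the whole remaining range.
  bool PredicateAtRangeStart = Predicate(Range.Start);
  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }
  return PredicateAtRangeStart;
}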
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -7094,19 +7129,57 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, SmallPtrSetImpl<Instruction *> &DeadInstructions) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -7125,10 +7198,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -7138,43 +7208,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -7199,6 +7232,33 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. 
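// The moveAfter()/getRecipe() transform above and the interleave-group
// rewrite below both depend on the ingredient-to-recipe map maintained by
// recordRecipeOf()/setRecipe(). A minimal sketch of that bookkeeping, with
// the surrounding VPRecipeBuilder class and the LLVM types (DenseMap,
// Instruction, VPRecipeBase) assumed from context:

DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;

/// Mark I as an ingredient whose recipe must be recorded once created.
void recordRecipeOf(Instruction *I) { Ingredient2Recipe[I] = nullptr; }

/// Record R as I's recipe, but only for marked ingredients.
void setRecipe(Instruction *I, VPRecipeBase *R) {
  auto It = Ingredient2Recipe.find(I);
  if (It != Ingredient2Recipe.end())
    It->second = R;
}

/// Return the recipe recorded for I; by transform time it must exist.
VPRecipeBase *getRecipe(Instruction *I) {
  assert(Ingredient2Recipe.lookup(I) && "Recipe was not recorded");
  return Ingredient2Recipe.lookup(I);
}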
+ for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. if (CM.foldTailByMasking()) { @@ -7255,9 +7315,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7266,13 +7325,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7286,6 +7353,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7336,15 +7408,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7431,29 +7496,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); - - // Last (and currently only) operand is a mask. 
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); -} - -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || Hints.getPredicate()) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; - - return SEL; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} + +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7470,14 +7552,16 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. 
Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); @@ -7562,7 +7646,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7635,7 +7720,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 974eff9974d9..aabd974cd73e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -72,6 +73,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -127,6 +129,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<int> +MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, + cl::desc("Maximum depth of the lookup for consecutive stores.")); + /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited to this value. +static cl::opt<unsigned> LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. 
This prevents compilation time increase."));
+
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
@@ -547,7 +567,7 @@ public:
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
-  /// into account (anf updating it, if required) list of externally used
+  /// into account (and updating it, if required) list of externally used
   /// values stored in \p ExternallyUsedValues.
   void buildTree(ArrayRef<Value *> Roots,
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
@@ -609,7 +629,10 @@ public:
     return MinVecRegSize;
   }
 
-  /// Check if ArrayType or StructType is isomorphic to some VectorType.
+  /// Check if homogeneous aggregate is isomorphic to some VectorType.
+  /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
+  /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
+  /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
@@ -721,6 +744,7 @@ public:
 
     const DataLayout &DL;
     ScalarEvolution &SE;
+    const BoUpSLP &R;
 
     /// \returns the operand data at \p OpIdx and \p Lane.
     OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -746,6 +770,227 @@ public:
       std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
     }
 
+    // The hard-coded scores listed here are not very important. When computing
+    // the scores of matching one sub-tree with another, we are basically
+    // counting the number of values that are matching. So even if all scores
+    // are set to 1, we would still get a decent matching result.
+    // However, sometimes we have to break ties. For example we may have to
+    // choose between matching loads vs matching opcodes. This is what these
+    // scores are helping us with: they provide the order of preference.
+
+    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+    static const int ScoreConsecutiveLoads = 3;
+    /// ExtractElementInst from same vector and consecutive indexes.
+    static const int ScoreConsecutiveExtracts = 3;
+    /// Constants.
+    static const int ScoreConstants = 2;
+    /// Instructions with the same opcode.
+    static const int ScoreSameOpcode = 2;
+    /// Instructions with alt opcodes (e.g., add + sub).
+    static const int ScoreAltOpcodes = 1;
+    /// Identical instructions (a.k.a. splat or broadcast).
+    static const int ScoreSplat = 1;
+    /// Matching with an undef is preferable to failing.
+    static const int ScoreUndef = 1;
+    /// Score for failing to find a decent match.
+    static const int ScoreFail = 0;
+    /// User external to the vectorized code.
+    static const int ExternalUseCost = 1;
+    /// The user is internal but in a different lane.
+    static const int UserInDiffLaneCost = ExternalUseCost;
+
+    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+                               ScalarEvolution &SE) {
+      auto *LI1 = dyn_cast<LoadInst>(V1);
+      auto *LI2 = dyn_cast<LoadInst>(V2);
+      if (LI1 && LI2)
+        return isConsecutiveAccess(LI1, LI2, DL, SE)
+                   ? VLOperands::ScoreConsecutiveLoads
+                   : VLOperands::ScoreFail;
+
+      auto *C1 = dyn_cast<Constant>(V1);
+      auto *C2 = dyn_cast<Constant>(V2);
+      if (C1 && C2)
+        return VLOperands::ScoreConstants;
+
+      // Extracts from consecutive indexes of the same vector score better, as
+      // the extracts could be optimized away.
+      auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
+      auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
+      if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
+          cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
+              cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+        return VLOperands::ScoreConsecutiveExtracts;
+      }
+
+      auto *I1 = dyn_cast<Instruction>(V1);
+      auto *I2 = dyn_cast<Instruction>(V2);
+      if (I1 && I2) {
+        if (I1 == I2)
+          return VLOperands::ScoreSplat;
+        InstructionsState S = getSameOpcode({I1, I2});
+        // Note: Only consider instructions with <= 2 operands to avoid
+        // complexity explosion.
+        if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
+          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+                                  : VLOperands::ScoreSameOpcode;
+      }
+
+      if (isa<UndefValue>(V2))
+        return VLOperands::ScoreUndef;
+
+      return VLOperands::ScoreFail;
+    }
+
+    /// Holds the values and their lane that are taking part in the look-ahead
+    /// score calculation. This is used in the external uses cost calculation.
+    SmallDenseMap<Value *, int> InLookAheadValues;
+
+    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
+    /// either external to the vectorized code, or require shuffling.
+    int getExternalUsesCost(const std::pair<Value *, int> &LHS,
+                            const std::pair<Value *, int> &RHS) {
+      int Cost = 0;
+      SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+      for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
+        Value *V = Values[Idx].first;
+        // Calculate the absolute lane, using the minimum relative lane of LHS
+        // and RHS as base and Idx as the offset.
+        int Ln = std::min(LHS.second, RHS.second) + Idx;
+        assert(Ln >= 0 && "Bad lane calculation");
+        unsigned UsersBudget = LookAheadUsersBudget;
+        for (User *U : V->users()) {
+          if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
+            // The user is in the VectorizableTree. Check if we need to insert.
+            auto It = llvm::find(UserTE->Scalars, U);
+            assert(It != UserTE->Scalars.end() && "U is in UserTE");
+            int UserLn = std::distance(UserTE->Scalars.begin(), It);
+            assert(UserLn >= 0 && "Bad lane");
+            if (UserLn != Ln)
+              Cost += UserInDiffLaneCost;
+          } else {
+            // Check if the user is in the look-ahead code.
+            auto It2 = InLookAheadValues.find(U);
+            if (It2 != InLookAheadValues.end()) {
+              // The user is in the look-ahead code. Check the lane.
+              if (It2->second != Ln)
+                Cost += UserInDiffLaneCost;
+            } else {
+              // The user is neither in SLP tree nor in the look-ahead code.
+              Cost += ExternalUseCost;
+            }
+          }
+          // Limit the number of visited uses to cap compilation time.
+          if (--UsersBudget == 0)
+            break;
+        }
+      }
+      return Cost;
+    }
+
+    /// Go through the operands of \p LHS and \p RHS recursively until \p
+    /// MaxLevel, and return the cumulative score. For example:
+    /// \verbatim
+    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
+    ///     \ /       \ /        \ /       \ /
+    ///      +         +          +         +
+    ///      G1        G2         G3        G4
+    /// \endverbatim
+    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+    /// each level recursively, accumulating the score. It starts from matching
+    /// the additions at level 0, then moves on to the loads (level 1). The
+    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+    /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+    /// Please note that the order of the operands does not matter, as we
+    /// evaluate the score of all profitable combinations of operands. In
+    /// other words the score of G1 and G4 is the same as G1 and G2. This
+    /// heuristic is based on ideas described in:
+    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
+    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+    ///   Luís F. W. Góes
+    int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
+                           const std::pair<Value *, int> &RHS, int CurrLevel,
+                           int MaxLevel) {
+
+      Value *V1 = LHS.first;
+      Value *V2 = RHS.first;
+      // Get the shallow score of V1 and V2.
+      int ShallowScoreAtThisLevel =
+          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
+                                       getExternalUsesCost(LHS, RHS));
+      int Lane1 = LHS.second;
+      int Lane2 = RHS.second;
+
+      // If reached MaxLevel,
+      //  or if V1 and V2 are not instructions,
+      //  or if they are SPLAT,
+      //  or if they are not consecutive, early return the current cost.
+      auto *I1 = dyn_cast<Instruction>(V1);
+      auto *I2 = dyn_cast<Instruction>(V2);
+      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+        return ShallowScoreAtThisLevel;
+      assert(I1 && I2 && "Should have early exited.");
+
+      // Keep track of in-tree values for determining the external-use cost.
+      InLookAheadValues[V1] = Lane1;
+      InLookAheadValues[V2] = Lane2;
+
+      // Contains the I2 operand indexes that got matched with I1 operands.
+      SmallSet<unsigned, 4> Op2Used;
+
+      // Recursion towards the operands of I1 and I2. We are trying all possible
+      // operand pairs, and keeping track of the best score.
+      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+           OpIdx1 != NumOperands1; ++OpIdx1) {
+        // Try to pair operand OpIdx1 of I1 with the best operand of I2.
+        int MaxTmpScore = 0;
+        unsigned MaxOpIdx2 = 0;
+        bool FoundBest = false;
+        // If I2 is commutative try all combinations.
+        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
+        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
+        assert(FromIdx <= ToIdx && "Bad index");
+        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
+          // Skip operands already paired with OpIdx1.
+          if (Op2Used.count(OpIdx2))
+            continue;
+          // Recursively calculate the cost at each level.
+          int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
+                                            {I2->getOperand(OpIdx2), Lane2},
+                                            CurrLevel + 1, MaxLevel);
+          // Look for the best score.
+          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+            MaxTmpScore = TmpScore;
+            MaxOpIdx2 = OpIdx2;
+            FoundBest = true;
+          }
+        }
+        if (FoundBest) {
+          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+          Op2Used.insert(MaxOpIdx2);
+          ShallowScoreAtThisLevel += MaxTmpScore;
+        }
+      }
+      return ShallowScoreAtThisLevel;
+    }
+
+    /// \Returns the look-ahead score, which tells us how much the sub-trees
+    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
+    /// score. This helps break ties in an informed way when we cannot decide on
+    /// the order of the operands by just considering the immediate
+    /// predecessors.
+    int getLookAheadScore(const std::pair<Value *, int> &LHS,
+                          const std::pair<Value *, int> &RHS) {
+      InLookAheadValues.clear();
+      return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+    }
+
     // Search all operands in Ops[*][Lane] for the one that matches best
     // Ops[OpIdx][LastLane] and return its operand index.
     // If no good match can be found, return None.
@@ -763,9 +1008,6 @@ public:
       // The linearized opcode of the operand at OpIdx, Lane.
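// A worked example of the scoring above, using the G1..G4 groups from the
// getScoreAtLevelRec() comment and assuming no external-use costs and the
// default look-ahead depth of 2: for G1 = A[0]+B[0] and G2 = A[1]+B[1],
// the two adds first contribute ScoreSameOpcode (2); recursing one level,
// {A[0],A[1]} and {B[0],B[1]} each contribute ScoreConsecutiveLoads (3),
// for a total of 2 + 3 + 3 = 8. Against G3 = C[0]+D[0] both operand pairs
// yield ScoreFail (0), leaving only the 2 from the matching adds.
// G4 = B[1]+A[1] also scores 8, because the commutative search tries both
// operand orders.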
bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -794,41 +1036,19 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa<LoadInst>(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -959,8 +1179,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1189,7 +1409,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1211,7 +1432,8 @@ private: Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather = false; + enum EntryState { Vectorize, NeedToGather }; + EntryState State; /// Does this sequence require some shuffling? 
SmallVector<unsigned, 4> ReuseShuffleIndices; @@ -1353,15 +1575,30 @@ private: dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; - dbgs() << "MainOp: " << *MainOp << "\n"; - dbgs() << "AltOp: " << *AltOp << "\n"; + dbgs() << "State: "; + switch (State) { + case Vectorize: + dbgs() << "Vectorize\n"; + break; + case NeedToGather: + dbgs() << "NeedToGather\n"; + break; + } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) - dbgs() << *VectorizedValue; + dbgs() << *VectorizedValue << "\n"; else - dbgs() << "NULL"; - dbgs() << "\n"; + dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; @@ -1392,7 +1629,7 @@ private: TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; @@ -1721,7 +1958,7 @@ private: return nullptr; } - bool isInSchedulingRegion(ScheduleData *SD) { + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -2063,7 +2300,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) return "color=red"; return ""; } @@ -2115,7 +2352,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; // For each lane: @@ -2152,7 +2389,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); continue; } } @@ -2448,7 +2685,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; @@ -2543,7 +2780,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2590,7 +2827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // have the same opcode. 
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2637,9 +2874,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // We don't combine GEPs with non-constant indexes. + Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { auto Op = cast<Instruction>(V)->getOperand(1); - if (!isa<ConstantInt>(Op)) { + if (!isa<ConstantInt>(Op) || + (Op->getType() != Ty1 && + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace()))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -2665,24 +2907,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. + SmallVector<Value *, 4> PointerOps(VL.size()); + ValueList Operands(VL.size()); + auto POIter = PointerOps.begin(); + auto OIter = Operands.begin(); + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; } + *POIter = SI->getPointerOperand(); + *OIter = SI->getValueOperand(); + ++POIter; + ++OIter; + } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + // Need to reorder. 
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + } + return; + } + } - ValueList Operands; - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(0)); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -2777,7 +3069,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2806,27 +3098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast<StructType>(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast<ArrayType>(T)->getNumElements(); - EltTy = cast<ArrayType>(T)->getElementType(); + unsigned N = 1; + Type *EltTy = T; + + while (isa<CompositeType>(EltTy)) { + if (auto *ST = dyn_cast<StructType>(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast<SequentialType>(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } + if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. - for (const auto *Ty : ST->elements()) - if (Ty != EltTy) - return 0; - } return N; } @@ -2927,7 +3221,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { @@ -2995,7 +3289,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
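// A worked example of the consecutive-store check above (hypothetical
// bundle): for four i32 stores to A[7], A[4], A[5], A[6], sortPtrAccesses
// returns CurrentOrder = {1, 2, 3, 0}, so Ptr0 = &A[4] and PtrN = &A[7].
// SE->getMinusSCEV(ScevN, Scev0) folds to 12 bytes, which matches
// (VL.size() - 1) * Size = 3 * 4, so the bundle is accepted as a vector
// of jumbled stores and later fixed up with a single reorder shuffle.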
@@ -3135,13 +3429,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost( + E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3162,7 +3456,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { @@ -3180,15 +3474,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = + cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + MaybeAlign Alignment(SI->getAlignment()); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, + VecTy, Alignment, 0, VL0); + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + VecStCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3274,20 +3575,22 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; @@ -3397,7 +3700,7 @@ int BoUpSLP::getSpillCost() const { continue; } - // Debug informations don't impact spill cost. + // Debug information does not impact spill cost. 
if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && &*PrevInstIt != PrevInst) @@ -3441,12 +3744,13 @@ int BoUpSLP::getTreeCost() { // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3538,13 +3842,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -3735,7 +4041,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -3790,7 +4096,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3823,7 +4129,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -4050,15 +4356,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast<StoreInst>( + IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + "reorder_shuffle"); + } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast( + ScalarPtr, VecValue->getType()->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -4088,7 +4404,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - Value *OpVec = vectorizeTree(E->getOperand(j)); + ValueList &VL = E->getOperand(j); + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(j)->getType(); + Type *Ty = llvm::all_of( + VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + for (Value *&V : VL) { + auto *CI = cast<ConstantInt>(V); + V = ConstantExpr::getIntegerCast(CI, Ty, + CI->getValue().isSignBitSet()); + } + Value *OpVec = vectorizeTree(VL); OpVecs.push_back(OpVec); } @@ -4284,7 +4615,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4357,7 +4688,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -5332,125 +5663,140 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); - LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + unsigned Idx) { + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned VF = VecRegSize / Sz; + const unsigned MinVF = R.getMinVecRegSize() / Sz; + unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || VF < 2) + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) return false; - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { - - ArrayRef<Value *> Operands = Chain.slice(i, VF); - // Check that a previous iteration of this loop did not delete the Value. 
- if (llvm::any_of(Operands, [&R](Value *V) { - auto *I = dyn_cast<Instruction>(V); - return I && R.isDeleted(I); - })) - continue; - - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - - R.buildTree(Operands); - if (R.isTreeTinyAndNotFullyVectorizable()) - continue; + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx + << "\n"); - R.computeMinimumValueSizes(); + R.buildTree(Chain); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); + llvm::transform(*Order, ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; - int Cost = R.getTreeCost(); + R.computeMinimumValueSizes(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + int Cost = R.getTreeCost(); - using namespace ore; + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", - cast<StoreInst>(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " - << NV("TreeSize", R.getTreeSize())); + using namespace ore; - R.vectorizeTree(); + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[0])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + R.vectorizeTree(); + return true; } - return Changed; + return false; } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads; - SmallDenseSet<StoreInst *> Tails; - SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; - // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - auto &&FindConsecutiveAccess = - [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { - if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) - return false; - - Tails.insert(Stores[Idx]); - Heads.insert(Stores[K]); - ConsecutiveChain[Stores[K]] = Stores[Idx]; - return true; - }; + int E = Stores.size(); + SmallBitVector Tails(E, false); + SmallVector<int, 16> ConsecutiveChain(E, E + 1); + int MaxIter = MaxStoreLookup.getValue(); + int IterCnt; + auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &ConsecutiveChain](int K, int Idx) { + if (IterCnt >= MaxIter) + return true; + ++IterCnt; + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; + Tails.set(Idx); + ConsecutiveChain[K] = Idx; + return true; + }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. 
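// Continuing the hypothetical A[7], A[4], A[5], A[6] bundle from above in
// the reordered-chain path: if bestOrder() returns {1, 2, 3, 0}, the
// transform builds ReorderedOps = {Chain[1], Chain[2], Chain[3], Chain[0]},
// i.e. the stores in address order A[4..7], and the second buildTree() then
// sees an already-consecutive chain it can cost and vectorize directly.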
- int E = Stores.size(); for (int Idx = E - 1; Idx >= 0; --Idx) { // If a store has multiple consecutive store candidates, search according // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + const int MaxLookDepth = std::max(E - Idx, Idx + 1); + IterCnt = 0; + for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; } // For stores that start but don't end a link in the chain: - for (auto *SI : llvm::reverse(Heads)) { - if (Tails.count(SI)) + for (int Cnt = E; Cnt > 0; --Cnt) { + int I = Cnt - 1; + if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) continue; - // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = SI; // Collect the chain into a list. - while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { - Operands.push_back(I); + while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + Operands.push_back(Stores[I]); // Move to the next value in the chain. I = ConsecutiveChain[I]; } + // If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Stores[0]); + if (MaxVecRegSize % EltSize != 0) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); - Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - break; + unsigned StartIdx = 0; + for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + vectorizeStoreChain(Slice, R, Cnt)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } } @@ -5835,38 +6181,36 @@ class HorizontalReduction { explicit operator bool() const { return Opcode; } - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); + /// Return true if this operation is any kind of minimum or maximum. + bool isMinMax() const { switch (Kind) { + case RK_Arithmetic: + return false; case RK_Min: - case RK_UMin: case RK_Max: + case RK_UMin: case RK_UMax: - return 1; - case RK_Arithmetic: + return true; case RK_None: break; } - return 0; + llvm_unreachable("Reduction kind is not set"); + } + + /// Get the index of the first operand. 
+ unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RK_None) + return 0; + return isMinMax() ? 1 : 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return 2; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: - return 3; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? 3 : 2; } /// Checks if the operation has the same parent as \p P. @@ -5875,79 +6219,46 @@ class HorizontalReduction { "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; - switch (Kind) { - case RK_Arithmetic: - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: { + if (isMinMax()) { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; } + /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->hasOneUse(); - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) return I->hasNUses(2) && (!IsReductionOp || cast<SelectInst>(I)->getCondition()->hasOneUse()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return I->hasOneUse(); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps.assign(1, ReductionOpsType()); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) ReductionOps.assign(2, ReductionOpsType()); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); - } + else + ReductionOps.assign(1, ReductionOpsType()); } + /// Add all reduction operations for the reduction instruction \p I. void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps[0].emplace_back(I); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) { ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); ReductionOps[1].emplace_back(I); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + } else { + ReductionOps[0].emplace_back(I); } } @@ -5980,12 +6291,12 @@ class HorizontalReduction { /// Checks if two operation data are both a reduction op or both a reduced /// value. 
- bool operator==(const OperationData &OD) { + bool operator==(const OperationData &OD) const { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } - bool operator!=(const OperationData &OD) { return !(*this == OD); } + bool operator!=(const OperationData &OD) const { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; @@ -6005,18 +6316,7 @@ class HorizontalReduction { Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { - switch (Kind) { - case RK_Arithmetic: - return nullptr; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return CmpInst::makeCmpResultType(LHS->getType()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr; } /// Creates reduction operation with the current opcode with the IR flags @@ -6400,6 +6700,18 @@ public: assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } + + // The compare instruction of a min/max is the insertion point for new + // instructions and may be replaced with a new compare instruction. + auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + assert(isa<SelectInst>(RdxRootInst) && + "Expected min/max reduction to have select root instruction"); + Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); + assert(isa<Instruction>(ScalarCond) && + "Expected min/max reduction to have compare condition"); + return cast<Instruction>(ScalarCond); + }; + // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -6455,8 +6767,14 @@ public: DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. - Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); + // Emit a reduction. For min/max, the root is a select, but the insertion + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (ReductionData.isMinMax()) + Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -6492,8 +6810,20 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } - // Update users. + + // Update users. For a min/max reduction that ends with a compare and + // select, we also have to RAUW for the compare instruction feeding the + // reduction root. That's because the original compare may have extra uses + // besides the final select of the reduction. + if (ReductionData.isMinMax()) { + if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { + Instruction *ScalarCmp = + getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); + ScalarCmp->replaceAllUsesWith(VecSelect->getCondition()); + } + } ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the // vector reductions. 
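The switch-heavy OperationData helpers earlier in this class all collapse into the single isMinMax() predicate. A reduced mock of that refactor (names mirror the patch; the real class also tracks the opcode and both operands):

#include <cassert>

enum ReductionKind { RK_None, RK_Arithmetic, RK_Min, RK_Max, RK_UMin, RK_UMax };

struct OperationDataSketch {
  ReductionKind Kind;

  bool isMinMax() const {
    assert(Kind != RK_None && "Reduction kind is not set");
    return Kind == RK_Min || Kind == RK_Max || Kind == RK_UMin ||
           Kind == RK_UMax;
  }
  // A min/max is a select fed by a compare, so its operands are
  // (condition, lhs, rhs) and the reduced values start at index 1;
  // arithmetic reductions are plain binary operations.
  unsigned getFirstOperandIndex() const { return isMinMax() ? 1 : 0; }
  unsigned getNumberOfOperands() const { return isMinMax() ? 3 : 2; }
};

int main() {
  OperationDataSketch Max = {RK_Max};
  OperationDataSketch Add = {RK_Arithmetic};
  assert(Max.getNumberOfOperands() == 3 && Max.getFirstOperandIndex() == 1);
  assert(Add.getNumberOfOperands() == 2 && Add.getFirstOperandIndex() == 0);
  return 0;
}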
V.eraseInstructions(IgnoreList); @@ -6619,45 +6949,54 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa<UndefValue>(V)) - break; - LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. -static bool findBuildAggregate(InsertValueInst *IV, - SmallVectorImpl<Value *> &BuildVectorOpds) { +static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + int &UserCost) { + assert((isa<InsertElementInst>(LastInsertInst) || + isa<InsertValueInst>(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); + UserCost = 0; do { - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - Value *V = IV->getAggregateOperand(); - if (isa<UndefValue>(V)) + Value *InsertedOperand; + if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) { + InsertedOperand = IE->getOperand(1); + LastInsertInst = IE->getOperand(0); + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + IE->getType(), CI->getZExtValue()); + } + } else { + auto *IV = cast<InsertValueInst>(LastInsertInst); + InsertedOperand = IV->getInsertedValueOperand(); + LastInsertInst = IV->getAggregateOperand(); + } + if (isa<InsertElementInst>(InsertedOperand) || + isa<InsertValueInst>(InsertedOperand)) { + int TmpUserCost; + SmallVector<Value *, 8> TmpBuildVectorOpds; + if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, + TmpUserCost)) + return false; + BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), + TmpBuildVectorOpds.rend()); + UserCost += TmpUserCost; + } else { + BuildVectorOpds.push_back(InsertedOperand); + } + if (isa<UndefValue>(LastInsertInst)) break; - IV = dyn_cast<InsertValueInst>(V); - if (!IV || !IV->hasOneUse()) + if ((!isa<InsertValueInst>(LastInsertInst) && + !isa<InsertElementInst>(LastInsertInst)) || + !LastInsertInst->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -6825,25 +7164,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { + int 
UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, BuildVectorOpds)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7118,14 +7458,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - // Process the stores in chunks of 16. - // TODO: The limit of 16 inhibits greater vectorization factors. - // For example, AVX2 supports v32i8. Increasing this limit, however, - // may cause a significant compile-time increase. - for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { - unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); - } + Changed |= vectorizeStores(it->second, R); } return Changed; } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 0ca6a6b93cfd..598fb00e956e 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -47,6 +47,24 @@ class VPRecipeBuilder { EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to + // their recipe. To save on memory, only do so for selected ingredients, + // marked by having a nullptr entry in this map. If those ingredients get a + // VPWidenRecipe, also avoid compressing other ingredients into it to avoid + // having to split such recipes later. + DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe; + VPWidenRecipe *LastExtensibleRecipe = nullptr; + + /// Set the recipe created for given ingredient. This operation is a no-op for + /// ingredients that were not marked using a nullptr entry in the map. + void setRecipe(Instruction *I, VPRecipeBase *R) { + if (!Ingredient2Recipe.count(I)) + return; + assert(Ingredient2Recipe[I] == nullptr && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = R; + } + public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -57,16 +75,22 @@ public: /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. 
- /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for + /// it. + void recordRecipeOf(Instruction *I) { + assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = nullptr; + } + + /// Return the recipe created for given ingredient. + VPRecipeBase *getRecipe(Instruction *I) { + assert(Ingredient2Recipe.count(I) && + "Recording this ingredients recipe was not requested"); + assert(Ingredient2Recipe[I] != nullptr && + "Ingredient doesn't have a recipe"); + return Ingredient2Recipe[I]; + } /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4b80d1fb20aa..f1c708720ccf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" @@ -275,18 +276,35 @@ void VPRegionBlock::execute(VPTransformState *State) { } void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insert(InsertPos->getIterator(), this); } +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); return getParent()->getRecipeList().erase(getIterator()); } void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { - InsertPos->getParent()->getRecipeList().splice( - std::next(InsertPos->getIterator()), getParent()->getRecipeList(), - getIterator()); + removeFromParent(); + insertAfter(InsertPos); } void VPInstruction::generateInstruction(VPTransformState &State, @@ -447,14 +465,20 @@ void VPlan::execute(VPTransformState *State) { // We do not attempt to preserve DT for outer loop vectorization currently. 
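The Ingredient2Recipe protocol introduced above is easy to misread: recordRecipeOf() marks an ingredient with a null entry, setRecipe() silently ignores unmarked ingredients, and getRecipe() insists the entry is both marked and filled. A self-contained mock of that contract (Instr and Recipe are bare stand-ins for Instruction and VPRecipeBase):

#include <cassert>
#include <map>

struct Instr {};
struct Recipe {};

struct RecipeBuilderSketch {
  std::map<Instr *, Recipe *> Ingredient2Recipe;

  // Mark an ingredient whose recipe should be recorded once created.
  void recordRecipeOf(Instr *I) { Ingredient2Recipe[I] = nullptr; }

  void setRecipe(Instr *I, Recipe *R) {
    auto It = Ingredient2Recipe.find(I);
    if (It == Ingredient2Recipe.end())
      return; // Recording was never requested for this ingredient: no-op.
    assert(It->second == nullptr && "Recipe already set for ingredient");
    It->second = R;
  }

  Recipe *getRecipe(Instr *I) {
    auto It = Ingredient2Recipe.find(I);
    assert(It != Ingredient2Recipe.end() && It->second &&
           "Recipe was not recorded for this ingredient");
    return It->second;
  }
};

int main() {
  Instr A, B;
  Recipe R;
  RecipeBuilderSketch Builder;
  Builder.recordRecipeOf(&A);
  Builder.setRecipe(&A, &R); // Marked earlier: recorded.
  Builder.setRecipe(&B, &R); // Never marked: silently ignored.
  assert(Builder.getRecipe(&A) == &R);
  return 0;
}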
if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, + L->getExitBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void VPlan::dump() const { dbgs() << *this << '\n'; } +#endif + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { + BasicBlock *LoopLatchBB, + BasicBlock *LoopExitBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. @@ -485,6 +509,9 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); } + // Latch block is a new dominator for the loop exit. + DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -509,8 +536,7 @@ void VPlanPrinter::dump() { if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { OS << ", where:"; if (Plan.BackedgeTakenCount) - OS << "\\n" - << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount"; + OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; for (auto Entry : Plan.Value2VPValue) { OS << "\\n" << *Entry.second; OS << DOT::EscapeString(" := "); @@ -522,7 +548,7 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - for (VPBlockBase *Block : depth_first(Plan.getEntry())) + for (const VPBlockBase *Block : depth_first(Plan.getEntry())) dumpBlock(Block); OS << "}\n"; @@ -661,6 +687,16 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, O << " " << VPlanIngredient(IV) << "\\l\""; } +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? 
"Inv" : "Var") << "]"; + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; +} + void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; } @@ -703,9 +739,12 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 44d8a198f27e..c65abc3639d7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -31,6 +31,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -226,6 +227,8 @@ public: struct VPCallback { virtual ~VPCallback() {} virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; + virtual Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) = 0; }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -268,6 +271,13 @@ struct VPTransformState { return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); } + /// Get the generated Value for a given VPValue and given Part and Lane. Note + /// that as per-lane Defs are still created by ILV and managed in its ValueMap + /// this method currently just delegates the call to ILV. + Value *get(VPValue *Def, const VPIteration &Instance) { + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); + } + /// Set the generated Value for a given VPValue and a given Part. void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -567,6 +577,7 @@ public: /// instructions. class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { friend VPBasicBlock; + friend class VPBlockUtils; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -586,6 +597,7 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, @@ -615,10 +627,18 @@ public: /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked Recipe into a basic block immediately after + /// the specified Recipe. + void insertAfter(VPRecipeBase *InsertPos); + /// Unlink this recipe from its current VPBasicBlock and insert it into /// the VPBasicBlock that MovePos lives in, right after MovePos. void moveAfter(VPRecipeBase *MovePos); + /// This method unlinks 'this' from the containing basic block, but does not + /// delete it. + void removeFromParent(); + /// This method unlinks 'this' from the containing basic block and deletes it. /// /// \returns an iterator pointing to the element after the erased one @@ -630,7 +650,6 @@ public: /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
class VPInstruction : public VPUser, public VPRecipeBase { - friend class VPlanHCFGTransforms; friend class VPlanSlp; public: @@ -740,6 +759,36 @@ public: void print(raw_ostream &O, const Twine &Indent) const override; }; +/// A recipe for handling GEP instructions. +class VPWidenGEPRecipe : public VPRecipeBase { +private: + GetElementPtrInst *GEP; + bool IsPtrLoopInvariant; + SmallBitVector IsIndexLoopInvariant; + +public: + VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + IsIndexLoopInvariant(GEP->getNumIndices(), false) { + IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); + for (auto Index : enumerate(GEP->indices())) + IsIndexLoopInvariant[Index.index()] = + OrigLoop->isLoopInvariant(Index.value().get()); + } + ~VPWidenGEPRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + } + + /// Generate the gep nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { @@ -822,13 +871,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup<Instruction> *IG; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + if (Mask) + User.addOperand(Mask); } ~VPInterleaveRecipe() override = default; @@ -837,6 +887,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -959,13 +1021,14 @@ public: class VPWidenMemoryInstructionRecipe : public VPRecipeBase { private: Instruction &Instr; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) { + if (Mask) + User.addOperand(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -973,6 +1036,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; } + /// Return the address accessed by this recipe. 
+ VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load/store. void execute(VPTransformState &State) override; @@ -1143,6 +1218,128 @@ public: void execute(struct VPTransformState *State) override; }; +//===----------------------------------------------------------------------===// +// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // +//===----------------------------------------------------------------------===// + +// The following set of template specializations implement GraphTraits to treat +// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note +// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the +// VPBlockBase is a VPRegionBlock, this specialization provides access to its +// successors/predecessors but not to the blocks inside the region. + +template <> struct GraphTraits<VPBlockBase *> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Inverse order specialization for VPBasicBlocks. Predecessors are used instead +// of successors for the inverse traversal. +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +// The following set of template specializations implement GraphTraits to +// treat VPRegionBlock as a graph and recurse inside its nodes. It's important +// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases +// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so +// there won't be automatic recursion into other VPBlockBases that turn to be +// VPRegionBlocks. + +template <> +struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. 
+ return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<const VPRegionBlock *> + : public GraphTraits<const VPBlockBase *> { + using GraphRef = const VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<Inverse<VPRegionBlock *>> + : public GraphTraits<Inverse<VPBlockBase *>> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(Inverse<GraphRef> N) { + return N.Graph->getExit(); + } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getExit()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1245,35 +1442,45 @@ public: return Value2VPValue[V]; } + VPValue *getOrAddVPValue(Value *V) { + assert(V && "Trying to get or add the VPValue of a null Value"); + if (!Value2VPValue.count(V)) + addVPValue(V); + return getVPValue(V); + } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Dump the plan to stderr (for debugging). + void dump() const; + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(DominatorTree *DT, + static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); + BasicBlock *LoopExitBB); }; /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); friend inline raw_ostream &operator<<(raw_ostream &OS, const struct VPlanIngredient &I); private: raw_ostream &OS; - VPlan &Plan; - unsigned Depth; + const VPlan &Plan; + unsigned Depth = 0; unsigned TabWidth = 2; std::string Indent; unsigned BID = 0; SmallDenseMap<const VPBlockBase *, unsigned> BlockID; - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {} /// Handle indentation. 
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1320,135 +1527,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { +inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { VPlanPrinter Printer(OS, Plan); Printer.dump(); return OS; } //===----------------------------------------------------------------------===// -// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // -//===----------------------------------------------------------------------===// - -// The following set of template specializations implement GraphTraits to treat -// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note -// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the -// VPBlockBase is a VPRegionBlock, this specialization provides access to its -// successors/predecessors but not to the blocks inside the region. - -template <> struct GraphTraits<VPBlockBase *> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits<const VPBlockBase *> { - using NodeRef = const VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Inverse order specialization for VPBasicBlocks. Predecessors are used instead -// of successors for the inverse traversal. -template <> struct GraphTraits<Inverse<VPBlockBase *>> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -// The following set of template specializations implement GraphTraits to -// treat VPRegionBlock as a graph and recurse inside its nodes. It's important -// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases -// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so -// there won't be automatic recursion into other VPBlockBases that turn to be -// VPRegionBlocks. - -template <> -struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. 
- return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<const VPRegionBlock *> - : public GraphTraits<const VPBlockBase *> { - using GraphRef = const VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<Inverse<VPRegionBlock *>> - : public GraphTraits<Inverse<VPBlockBase *>> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(Inverse<GraphRef> N) { - return N.Graph->getExit(); - } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -//===----------------------------------------------------------------------===// // VPlan Utilities //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b22d3190d654..3f6a2efd55cc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1,4 +1,4 @@ -//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// +//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,13 +11,13 @@ /// //===----------------------------------------------------------------------===// -#include "VPlanHCFGTransforms.h" +#include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; -void VPlanHCFGTransforms::VPInstructionsToVPRecipes( - VPlanPtr &Plan, +void VPlanTransforms::VPInstructionsToVPRecipes( + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions) { @@ -56,7 +56,9 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; // Create VPWidenMemoryInstructionRecipe for loads and stores. if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) - NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/); + NewRecipe = new VPWidenMemoryInstructionRecipe( + *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + nullptr /*Mask*/); else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { InductionDescriptor II = Inductions->lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || @@ -64,6 +66,8 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); } else { // If the last recipe is a VPWidenRecipe, add Inst to it instead of // creating a new recipe. 
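Condensing the dispatch VPInstructionsToVPRecipes performs after this patch: loads and stores get a memory recipe that now carries their address as a VPValue, induction phis and other phis get dedicated recipes, and GEPs map to the new VPWidenGEPRecipe. A skeletal version with stand-in types (the real code switches on the IR instruction itself, not an enum):

#include <cassert>
#include <memory>

struct Recipe { virtual ~Recipe() = default; };
struct WidenMemoryRecipe : Recipe {};
struct WidenIntOrFpInductionRecipe : Recipe {};
struct WidenPHIRecipe : Recipe {};
struct WidenGEPRecipe : Recipe {};
struct WidenRecipe : Recipe {};

enum class InstKind { Load, Store, IntOrFpInductionPhi, OtherPhi, GEP, Other };

std::unique_ptr<Recipe> buildRecipe(InstKind K) {
  switch (K) {
  case InstKind::Load:
  case InstKind::Store:
    // Now also receives the pointer operand as a VPValue (elided here).
    return std::make_unique<WidenMemoryRecipe>();
  case InstKind::IntOrFpInductionPhi:
    return std::make_unique<WidenIntOrFpInductionRecipe>();
  case InstKind::OtherPhi:
    return std::make_unique<WidenPHIRecipe>();
  case InstKind::GEP:
    // New in this patch: GEPs get their own widening recipe.
    return std::make_unique<WidenGEPRecipe>();
  case InstKind::Other:
    return std::make_unique<WidenRecipe>();
  }
  return nullptr;
}

int main() {
  auto R = buildRecipe(InstKind::GEP);
  assert(dynamic_cast<WidenGEPRecipe *>(R.get()) != nullptr);
  return 0;
}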
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 79a23c33184f..0d3bd7da09a7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -1,4 +1,4 @@ -//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// +//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,8 +10,8 @@ /// This file provides utility VPlan to VPlan transformations. //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" #include "llvm/IR/Instruction.h" @@ -19,17 +19,17 @@ namespace llvm { -class VPlanHCFGTransforms { +class VPlanTransforms { public: /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. static void VPInstructionsToVPRecipes( - VPlanPtr &Plan, + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions); }; } // namespace llvm -#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 7b6c228c229e..464498c29d89 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -37,7 +37,7 @@ class VPUser; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; - friend class VPlanHCFGTransforms; + friend class VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 394b1b93113b..ab3e7e2282e7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -14,6 +14,7 @@ #include "VPlanVerifier.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "loop-vectorize"
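A closing note on the operand convention the widened memory recipes now share: VPInterleaveRecipe and VPWidenMemoryInstructionRecipe both store the address as a mandatory first operand and the mask as an optional second one, with a null mask meaning all lanes are active. A minimal mock of that layout (VPValue and the VPUser operand list are reduced to plain pointers here):

#include <cassert>
#include <vector>

struct VPValueSketch {};

struct MemoryRecipeSketch {
  std::vector<VPValueSketch *> Operands;

  MemoryRecipeSketch(VPValueSketch *Addr, VPValueSketch *Mask) {
    Operands.push_back(Addr); // Address is the 1st, mandatory operand.
    if (Mask)
      Operands.push_back(Mask); // Mask is optional and stored last.
  }

  VPValueSketch *getAddr() const { return Operands[0]; }

  // A full (all-ones) mask is represented by a nullptr.
  VPValueSketch *getMask() const {
    return Operands.size() == 2 ? Operands[1] : nullptr;
  }
};

int main() {
  VPValueSketch Addr, Mask;
  MemoryRecipeSketch Unmasked(&Addr, nullptr), Masked(&Addr, &Mask);
  assert(Unmasked.getAddr() == &Addr && Unmasked.getMask() == nullptr);
  assert(Masked.getAddr() == &Addr && Masked.getMask() == &Mask);
  return 0;
}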