Skip to content

Commit

Permalink
[SLP]Improve/fix extracts calculations for non-power-of-2 elements.
Browse files Browse the repository at this point in the history
Change-Id: I6ea9a21eba83034bb01bb1ab9aabb2b97b0d40c2
  • Loading branch information
jrbyrnes committed Jul 23, 2024
1 parent 68f6b4e commit 26466ce
Show file tree
Hide file tree
Showing 10 changed files with 593 additions and 303 deletions.
127 changes: 86 additions & 41 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,21 @@ static std::string shortBundleName(ArrayRef<Value *> VL) {
}
#endif

/// Computes how many elements one register (part) holds when \p Size elements
/// are distributed across \p NumParts registers (parts). The even share is
/// obtained with divideCeil and then passed through PowerOf2Ceil, so the
/// result is a power-of-2 element count and may exceed the even share.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  const auto ElemsPerPart = divideCeil(Size, NumParts);
  return PowerOf2Ceil(ElemsPerPart);
}

/// Returns the number of elements that fall into register (part) \p Part when
/// \p Size elements are split into consecutive chunks of \p PartNumElems
/// elements each (\p PartNumElems is expected to be a power of 2).
///
/// Parts that lie fully inside the range yield \p PartNumElems, the trailing
/// partial part yields the remainder, and a part that starts at or beyond
/// \p Size yields 0.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // Compute the starting offset in 64 bits so the multiplication cannot wrap.
  const uint64_t Offset = static_cast<uint64_t>(Part) * PartNumElems;
  // Guard the subtraction below: with unsigned operands, an offset past the
  // end would wrap around and make the part look full instead of empty.
  if (Offset >= Size)
    return 0;
  const unsigned Remaining = Size - static_cast<unsigned>(Offset);
  return std::min<unsigned>(PartNumElems, Remaining);
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Expand Down Expand Up @@ -7139,14 +7154,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
if (NumSrcRegs == 0)
NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
unsigned EltsPerVector = PowerOf2Ceil(std::max(
divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
auto CheckPerRegistersShuffle =
[&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
[&](MutableArrayRef<int> Mask,
SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
if (NumElts <= EltsPerVector)
return std::nullopt;
DenseSet<int> RegIndices;
// Check that if trying to permute same single/2 input vectors.
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
int FirstRegId = -1;
Indices.assign(1, -1);
for (int &I : Mask) {
if (I == PoisonMaskElem)
continue;
Expand All @@ -7156,8 +7174,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
RegIndices.insert(RegId);
if (RegIndices.size() > 2)
return std::nullopt;
if (RegIndices.size() == 2)
if (RegIndices.size() == 2) {
ShuffleKind = TTI::SK_PermuteTwoSrc;
if (Indices.size() == 1)
Indices.push_back(-1);
}
if (RegId == FirstRegId)
Indices.front() = I % NumElts;
else
Indices.back() = I % NumElts;
I = (I % NumElts) % EltsPerVector +
(RegId == FirstRegId ? 0 : EltsPerVector);
}
Expand All @@ -7168,22 +7193,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
for (unsigned Part = 0; Part < NumParts; ++Part) {
for (unsigned Part : seq<unsigned>(NumParts)) {
if (!ShuffleKinds[Part])
continue;
ArrayRef<int> MaskSlice =
Mask.slice(Part * EltsPerVector,
(Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
? Mask.size() % EltsPerVector
: EltsPerVector);
ArrayRef<int> MaskSlice = Mask.slice(
Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
SmallVector<int> Indices;
std::optional<TTI::ShuffleKind> RegShuffleKind =
CheckPerRegistersShuffle(SubMask);
CheckPerRegistersShuffle(SubMask, Indices);
if (!RegShuffleKind) {
Cost += ::getShuffleCost(
TTI, *ShuffleKinds[Part],
FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
!ShuffleVectorInst::isIdentityMask(
MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
Cost += ::getShuffleCost(
TTI, *ShuffleKinds[Part],
FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
Expand All @@ -7193,6 +7219,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
FixedVectorType::get(VL.front()->getType(), EltsPerVector),
SubMask);
}
for (int Idx : Indices) {
Cost += ::getShuffleCost(
TTI, TTI::SK_ExtractSubvector,
FixedVectorType::get(VL.front()->getType(), NumElts), std::nullopt,
CostKind, Idx,
FixedVectorType::get(VL.front()->getType(), EltsPerVector));
}
}
return Cost;
}
Expand Down Expand Up @@ -7220,11 +7253,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InVectors.front().get<const TreeEntry *>() == &E1 &&
InVectors.back().get<const TreeEntry *>() == E2) ||
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
"Expected all poisoned elements.");
ArrayRef<int> SubMask =
ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
return;
}
Expand Down Expand Up @@ -7465,10 +7498,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
});
SmallPtrSet<Value *, 4> UniqueBases;
unsigned SliceSize = VL.size() / NumParts;
for (unsigned Part = 0; Part < NumParts; ++Part) {
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
for (unsigned Part : seq<unsigned>(NumParts)) {
unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
Expand Down Expand Up @@ -7561,7 +7595,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
unsigned SliceSize = Mask.size() / NumParts;
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
Expand All @@ -7579,7 +7613,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
unsigned SliceSize = Mask.size() / NumParts;
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
Expand Down Expand Up @@ -9339,12 +9373,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
unsigned SliceSize = VL.size() / NumParts;
for (unsigned Part = 0; Part < NumParts; ++Part) {
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
for (unsigned Part : seq<unsigned>(NumParts)) {
// Scan list of gathered scalars for extractelements that can be represented
// as shuffles.
MutableArrayRef<Value *> SubVL =
MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVector<int> SubMask;
std::optional<TTI::ShuffleKind> Res =
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
Expand Down Expand Up @@ -9730,10 +9764,11 @@ BoUpSLP::isGatherShuffledEntry(
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
unsigned SliceSize = VL.size() / NumParts;
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
for (unsigned Part = 0; Part < NumParts; ++Part) {
ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
for (unsigned Part : seq<unsigned>(NumParts)) {
ArrayRef<Value *> SubVL =
VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
Expand Down Expand Up @@ -10250,11 +10285,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
unsigned SliceSize = E->Scalars.size() / NumParts;
for (unsigned Part = 0; Part < NumParts; ++Part) {
unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
for (unsigned Part : seq<unsigned>(NumParts)) {
unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
ArrayRef<Value *> VL =
ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
#ifndef NDEBUG
Expand Down Expand Up @@ -10290,7 +10326,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
[&](unsigned P) {
ArrayRef<int> SubMask =
Mask.slice(P * SliceSize, SliceSize);
Mask.slice(P * SliceSize,
getNumElems(Mask.size(),
SliceSize, P));
return all_of(SubMask, [](int Idx) {
return Idx == PoisonMaskElem;
});
Expand Down Expand Up @@ -10663,13 +10701,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
std::iota(std::next(Mask.begin(), I * SliceSize),
std::next(Mask.begin(), (I + 1) * SliceSize), 0);
std::iota(
std::next(Mask.begin(), I * SliceSize),
std::next(Mask.begin(),
I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
0);
} else {
unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
std::fill(std::next(Mask.begin(), I * SliceSize),
std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
std::fill(
std::next(Mask.begin(), I * SliceSize),
std::next(Mask.begin(),
I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
IVal);
}
return true;
};
Expand Down Expand Up @@ -10930,7 +10974,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
}
if (!GatherShuffles.empty()) {
unsigned SliceSize = E->Scalars.size() / NumParts;
unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
for (const auto [I, TEs] : enumerate(Entries)) {
if (TEs.empty()) {
Expand All @@ -10940,7 +10984,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
VecMask.assign(VecMask.size(), PoisonMaskElem);
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
Expand Down
Loading

0 comments on commit 26466ce

Please sign in to comment.