Skip to content

Commit

Permalink
[AMDGPU] Vectorize i8 Shuffles
Browse files Browse the repository at this point in the history
  • Loading branch information
jrbyrnes committed Jun 17, 2024
1 parent eb6c7cd commit b503e21
Show file tree
Hide file tree
Showing 4 changed files with 236 additions and 284 deletions.
19 changes: 10 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1148,14 +1148,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);

// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
unsigned EltsPerReg = 32 / ScalarSize;
if (RequestedElts == 0)
return 0;
switch (Kind) {
Expand All @@ -1164,9 +1165,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
case TTI::SK_PermuteSingleSrc: {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (HasVOP3P && NumVectorElts == 2)
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
return 0;
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
return NumPerms + NumPermMasks;
Expand All @@ -1178,12 +1179,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return 0;
// Insert/extract subvectors only require shifts / extract code to get the
// relevant bits
return alignTo(RequestedElts, 2) / 2;
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
}
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
return NumPerms + NumPermMasks;
Expand Down
Loading

0 comments on commit b503e21

Please sign in to comment.