aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/SIISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp52
1 files changed, 46 insertions, 6 deletions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca4..76c2644867aa 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
}
+bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+ return (MemVT.getSizeInBits() <= 4 * 32);
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
+ return (MemVT.getSizeInBits() <= MaxPrivateBits);
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ return (MemVT.getSizeInBits() <= 2 * 32);
+ }
+ return true;
+}
+
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
@@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
- if (VT == MVT::i64) {
- const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- if (CRHS) {
- if (SDValue Split
- = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
- return Split;
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (VT == MVT::i64 && CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+ return Split;
+ }
+
+ if (CRHS && VT == MVT::i32) {
+ // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+ // nb = number of trailing zeroes in mask
+ // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+ // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+ uint64_t Mask = CRHS->getZExtValue();
+ unsigned Bits = countPopulation(Mask);
+ if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+ (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+ if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+ unsigned Shift = CShift->getZExtValue();
+ unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+ unsigned Offset = NB + Shift;
+ if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+ SDLoc SL(N);
+ SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ LHS->getOperand(0),
+ DAG.getConstant(Offset, SL, MVT::i32),
+ DAG.getConstant(Bits, SL, MVT::i32));
+ EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+ SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+ DAG.getValueType(NarrowVT));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+ DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+ return Shl;
+ }
+ }
}
}