Diffstat (limited to 'contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 1050
1 file changed, 592 insertions, 458 deletions
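One of the larger behavioral changes in this revision is that findMatchingInsn()/mergePairedInsns() can now pair a scaled ldr/str with an unscaled ldur/stur by normalizing both immediates to element units before applying the 7-bit signed range check of ldp/stp (inBoundsForPair). The standalone sketch below illustrates that arithmetic only; the struct, helper names, and driver are hypothetical stand-ins, not LLVM code. The diff itself follows.

```cpp
// Sketch of the offset normalization this revision adds when pairing a
// scaled ldr/str with an unscaled ldur/stur. The range check mirrors
// inBoundsForPair() in the diff: ldp/stp immediates are 7-bit signed,
// in units of the access size.
#include <cstdio>
#include <optional>

struct MemAccess {
  bool Unscaled;   // ldur/stur-style byte offset vs. ldr/str scaled offset
  int Offset;      // immediate as written in the instruction
  int AccessSize;  // bytes per element (e.g. 8 for x-register accesses)
};

// Normalize an offset to "elements"; an unscaled byte offset that is not a
// multiple of the access size cannot be paired, so report no value.
static std::optional<int> scaledOffset(const MemAccess &MA) {
  if (!MA.Unscaled)
    return MA.Offset;
  if (MA.Offset % MA.AccessSize)
    return std::nullopt;
  return MA.Offset / MA.AccessSize;
}

// A pair instruction (ldp/stp) has a 7-bit signed element offset.
static bool inBoundsForPair(int ElementOffset) {
  return ElementOffset <= 63 && ElementOffset >= -64;
}

// Two accesses are pairable if they touch adjacent elements and the lower
// offset fits in the ldp/stp immediate field.
static bool canPair(const MemAccess &A, const MemAccess &B) {
  auto OffA = scaledOffset(A), OffB = scaledOffset(B);
  if (!OffA || !OffB)
    return false;
  if (*OffA != *OffB + 1 && *OffB != *OffA + 1)
    return false;
  return inBoundsForPair(*OffA < *OffB ? *OffA : *OffB);
}

int main() {
  MemAccess Scaled{false, 2, 8};    // ldr x1, [x0, #16]
  MemAccess Unscaled{true, 24, 8};  // ldur x2, [x0, #24]
  std::printf("pairable: %d\n", canPair(Scaled, Unscaled)); // pairable: 1
}
```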
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 43664df3b861..dca13fc49414 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -33,9 +33,6 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" -/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); @@ -45,9 +42,19 @@ STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", +// The LdStLimit limits how far we search for load/store pairs. +static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); +// The UpdateLimit limits how far we search for update instructions when we form +// pre-/post-index instructions. +static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), + cl::Hidden); + +static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden, + cl::init(false), + cl::desc("Enable narrow load merge")); + namespace llvm { void initializeAArch64LoadStoreOptPass(PassRegistry &); } @@ -88,22 +95,29 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; + // Track which registers have been modified and used. + BitVector ModifiedRegs, UsedRegs; + // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, LdStPairFlags &Flags, - unsigned Limit); + unsigned Limit, + bool FindNarrowMerge); // Scan the instructions looking for a store that writes to the address from // which the current load instruction reads. Return true if one is found. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a wider instruction. + MachineBasicBlock::iterator + mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, + const LdStPairFlags &Flags); + // Merge the two instructions indicated into a single pair-wise instruction. - // If MergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -118,8 +132,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. 
MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int UnscaledOffset); + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, + int UnscaledOffset, unsigned Limit); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -129,7 +143,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find an instruction that updates the base register of the ld/st // instruction. - bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); // Merge a pre- or post-index base register update into a ld/st instruction. @@ -140,17 +154,21 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge foldable ldr/str instructions. bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and pair ldr/str instructions. + bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and promote load instructions which read directly from store. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); - // Check if converting two narrow loads into a single wider load with - // bitfield extracts could be enabled. - bool enableNarrowLdMerge(MachineFunction &Fn); - bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } @@ -161,37 +179,8 @@ char AArch64LoadStoreOpt::ID = 0; INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) -static bool isUnscaledLdSt(unsigned Opc) { - switch (Opc) { - default: - return false; - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURBBi: - case AArch64::STURHHi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - case AArch64::LDURSWi: - case AArch64::LDURHHi: - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - return true; - } -} - -static bool isUnscaledLdSt(MachineInstr *MI) { - return isUnscaledLdSt(MI->getOpcode()); -} - -static unsigned getBitExtrOpcode(MachineInstr *MI) { - switch (MI->getOpcode()) { +static unsigned getBitExtrOpcode(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode."); case AArch64::LDRBBui: @@ -219,10 +208,6 @@ static bool isNarrowStore(unsigned Opc) { } } -static bool isNarrowStore(MachineInstr *MI) { - return isNarrowStore(MI->getOpcode()); -} - static bool isNarrowLoad(unsigned Opc) { switch (Opc) { default: @@ -239,13 +224,17 @@ static bool isNarrowLoad(unsigned Opc) { } } -static bool isNarrowLoad(MachineInstr *MI) { - return isNarrowLoad(MI->getOpcode()); +static bool isNarrowLoad(MachineInstr &MI) { + return isNarrowLoad(MI.getOpcode()); +} + +static bool isNarrowLoadOrStore(unsigned Opc) { + return isNarrowLoad(Opc) || isNarrowStore(Opc); } // Scaling factor for unscaled load or store. 
-static int getMemScale(MachineInstr *MI) { - switch (MI->getOpcode()) { +static int getMemScale(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); case AArch64::LDRBBui: @@ -354,6 +343,37 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, } } +static unsigned getMatchingWideOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no wide equivalent!"); + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; + case AArch64::STURWi: + return AArch64::STURXi; + case AArch64::STRWui: + return AArch64::STRXui; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: @@ -367,14 +387,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; - case AArch64::STRBBui: - return AArch64::STRHHui; - case AArch64::STRHHui: - return AArch64::STRWui; - case AArch64::STURBBi: - return AArch64::STURHHi; - case AArch64::STURHHi: - return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -399,25 +411,13 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; - case AArch64::LDRHHui: - case AArch64::LDRSHWui: - return AArch64::LDRWui; - case AArch64::LDURHHi: - case AArch64::LDURSHWi: - return AArch64::LDURWi; - case AArch64::LDRBBui: - case AArch64::LDRSBWui: - return AArch64::LDRHHui; - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - return AArch64::LDURHHi; } } -static unsigned isMatchingStore(MachineInstr *LoadInst, - MachineInstr *StoreInst) { - unsigned LdOpc = LoadInst->getOpcode(); - unsigned StOpc = StoreInst->getOpcode(); +static unsigned isMatchingStore(MachineInstr &LoadInst, + MachineInstr &StoreInst) { + unsigned LdOpc = LoadInst.getOpcode(); + unsigned StOpc = StoreInst.getOpcode(); switch (LdOpc) { default: llvm_unreachable("Unsupported load instruction!"); @@ -562,8 +562,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { } } -static bool isPairedLdSt(const MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isPairedLdSt(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDPSi: @@ -581,41 +581,55 @@ static bool isPairedLdSt(const MachineInstr *MI) { } } -static const MachineOperand &getLdStRegOp(const MachineInstr *MI, +static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { +static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 2 : 1; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { +static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 
3 : 2; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, - MachineInstr *StoreInst) { +static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, + MachineInstr &StoreInst, + const AArch64InstrInfo *TII) { assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = getMemScale(LoadInst); int StoreSize = getMemScale(StoreInst); - int UnscaledStOffset = isUnscaledLdSt(StoreInst) + int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) ? getLdStOffsetOp(StoreInst).getImm() : getLdStOffsetOp(StoreInst).getImm() * StoreSize; - int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) ? getLdStOffsetOp(LoadInst).getImm() : getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } +static bool isPromotableZeroStoreOpcode(unsigned Opc) { + return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; +} + +static bool isPromotableZeroStoreOpcode(MachineInstr &MI) { + return isPromotableZeroStoreOpcode(MI.getOpcode()); +} + +static bool isPromotableZeroStoreInst(MachineInstr &MI) { + return (isPromotableZeroStoreOpcode(MI)) && + getLdStRegOp(MI).getReg() == AArch64::WZR; +} + MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, +AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; @@ -623,128 +637,124 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. - if (NextI == Paired) + if (NextI == MergeMI) ++NextI; - int SExtIdx = Flags.getSExtIdx(); - unsigned Opc = - SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdSt(Opc); - int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + unsigned Opc = I->getOpcode(); + bool IsScaled = !TII->isUnscaledLdSt(Opc); + int OffsetStride = IsScaled ? 1 : getMemScale(*I); bool MergeForward = Flags.getMergeForward(); - unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); + MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (getLdStOffsetOp(I).getImm() == - getLdStOffsetOp(Paired).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - // Here we swapped the assumption made for SExtIdx. - // I.e., we turn ldp I, Paired into ldp Paired, I. - // Update the index accordingly. 
- if (SExtIdx != -1) - SExtIdx = (SExtIdx + 1) % 2; + if (getLdStOffsetOp(*I).getImm() == + getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) { + RtMI = &*MergeMI; + Rt2MI = &*I; } else { - RtMI = I; - Rt2MI = Paired; + RtMI = &*I; + Rt2MI = &*MergeMI; } - int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Change the scaled offset from small to large type. + if (IsScaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); if (isNarrowLoad(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; - } - MachineInstr *RtNewDest = MergeForward ? I : Paired; + MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI); // When merging small (< 32 bit) loads for big-endian targets, the order of // the component parts gets swapped. if (!Subtarget->isLittleEndian()) std::swap(RtMI, Rt2MI); // Construct the new load instruction. MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; - NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtNewDest)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + NewMemMI = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addOperand(getLdStRegOp(*RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)NewMemMI; DEBUG( dbgs() << "Creating the new load and extract. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); + DEBUG(MergeMI->print(dbgs())); DEBUG(dbgs() << " with instructions:\n "); DEBUG((NewMemMI)->print(dbgs())); - int Width = getMemScale(I) == 1 ? 8 : 16; + int Width = getMemScale(*I) == 1 ? 8 : 16; int LSBLow = 0; int LSBHigh = Width; int ImmsLow = LSBLow + Width - 1; int ImmsHigh = LSBHigh + Width - 1; - MachineInstr *ExtDestMI = MergeForward ? Paired : I; + MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I); if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { // Create the bitfield extract for high bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. 
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } } else { // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } // Create the bitfield extract for high bits. - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); } + (void)BitExtMI1; + (void)BitExtMI2; + DEBUG(dbgs() << " "); DEBUG((BitExtMI1)->print(dbgs())); DEBUG(dbgs() << " "); @@ -753,47 +763,122 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Erase the old instructions. I->eraseFromParent(); - Paired->eraseFromParent(); + MergeMI->eraseFromParent(); return NextI; } + assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && + "Expected promotable zero store"); // Construct the new instruction. MachineInstrBuilder MIB; - if (isNarrowStore(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)MIB; + + DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(MergeMI->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. 
+ I->eraseFromParent(); + MergeMI->eraseFromParent(); + return NextI; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + int SExtIdx = Flags.getSExtIdx(); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = TII->isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; + + bool MergeForward = Flags.getMergeForward(); + // Insert our new paired instruction after whichever of the paired + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // Also based on MergeForward is from where we copy the base register operand + // so we get the flags compatible with the input code. + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); + + int Offset = getLdStOffsetOp(*I).getImm(); + int PairedOffset = getLdStOffsetOp(*Paired).getImm(); + bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); + if (IsUnscaled != PairedIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. If + // I is scaled then scale the offset of Paired accordingly. Otherwise, do + // the opposite (i.e., make Paired's offset unscaled). + int MemSize = getMemScale(*Paired); + if (PairedIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together. + assert(!(PairedOffset % getMemScale(*Paired)) && + "Offset should be a multiple of the stride!"); + PairedOffset /= MemSize; + } else { + PairedOffset *= MemSize; } - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(I)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + } + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (Offset == PairedOffset + OffsetStride) { + RtMI = &*Paired; + Rt2MI = &*I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; } else { - // Handle Unscaled - if (IsUnscaled) - OffsetImm /= OffsetStride; - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtMI)) - .addOperand(getLdStRegOp(Rt2MI)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + RtMI = &*I; + Rt2MI = &*Paired; + } + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Scale the immediate offset, if necessary. + if (TII->isUnscaledLdSt(RtMI->getOpcode())) { + assert(!(OffsetImm % getMemScale(*RtMI)) && + "Unscaled offset cannot be scaled."); + OffsetImm /= getMemScale(*RtMI); } - (void)MIB; + // Construct the new instruction. 
+ MachineInstrBuilder MIB; + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) + .addOperand(getLdStRegOp(*RtMI)) + .addOperand(getLdStRegOp(*Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? + (void)MIB; DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); - if (SExtIdx != -1) { // Generate the sign extension for the proper result of the ldp. // I.e., with X1, that would be: @@ -814,26 +899,23 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Insert this definition right after the generated LDP, i.e., before // InsertionPoint. MachineInstrBuilder MIBKill = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstRegW) + BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) .addReg(DstRegW) .addReg(DstRegX, RegState::Define); MIBKill->getOperand(2).setImplicit(); // Create the sign extension. MachineInstrBuilder MIBSXTW = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::SBFMXri), DstRegX) + BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) .addReg(DstRegX) .addImm(0) .addImm(31); (void)MIBSXTW; DEBUG(dbgs() << " Extend operand:\n "); DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); - DEBUG(dbgs() << "\n"); } else { DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); } + DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); @@ -848,10 +930,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator NextI = LoadI; ++NextI; - int LoadSize = getMemScale(LoadI); - int StoreSize = getMemScale(StoreI); - unsigned LdRt = getLdStRegOp(LoadI).getReg(); - unsigned StRt = getLdStRegOp(StoreI).getReg(); + int LoadSize = getMemScale(*LoadI); + int StoreSize = getMemScale(*StoreI); + unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -881,15 +963,16 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // performance and correctness are verified only in little-endian. if (!Subtarget->isLittleEndian()) return NextI; - bool IsUnscaled = isUnscaledLdSt(LoadI); - assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); + assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && + "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); int UnscaledLdOffset = IsUnscaled - ? getLdStOffsetOp(LoadI).getImm() - : getLdStOffsetOp(LoadI).getImm() * LoadSize; + ? getLdStOffsetOp(*LoadI).getImm() + : getLdStOffsetOp(*LoadI).getImm() * LoadSize; int UnscaledStOffset = IsUnscaled - ? getLdStOffsetOp(StoreI).getImm() - : getLdStOffsetOp(StoreI).getImm() * StoreSize; + ? 
getLdStOffsetOp(*StoreI).getImm() + : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); int Imms = Immr + Width - 1; @@ -926,6 +1009,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, .addImm(Imms); } } + (void)BitExtMI; DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); @@ -944,16 +1028,18 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (const MachineOperand &MO : MI->operands()) { + for (const MachineOperand &MO : MI.operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (!Reg) + continue; if (MO.isDef()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) ModifiedRegs.set(*AI); @@ -968,38 +1054,42 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. - if (IsUnscaled) + if (IsUnscaled) { + // If the byte-offset isn't a multiple of the stride, there's no point + // trying to match it. + if (Offset % OffsetStride) + return false; Offset /= OffsetStride; - + } return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, // avoiding having to do a C-style cast from uint_64t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// using alignTo from include/llvm/Support/MathExtras.h. // FIXME: Move this function to include/MathExtras.h? static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, +static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, const AArch64InstrInfo *TII) { // One of the instructions must modify memory. - if (!MIa->mayStore() && !MIb->mayStore()) + if (!MIa.mayStore() && !MIb.mayStore()) return false; // Both instructions must be memory operations. 
- if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) + if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); } -static bool mayAlias(MachineInstr *MIa, +static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, const AArch64InstrInfo *TII) { - for (auto &MIb : MemInsns) - if (mayAlias(MIa, MIb, TII)) + for (MachineInstr *MIb : MemInsns) + if (mayAlias(MIa, *MIb, TII)) return true; return false; @@ -1008,40 +1098,43 @@ static bool mayAlias(MachineInstr *MIa, bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI) { - MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + MachineInstr &LoadMI = *I; + unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + + // If the load is the first instruction in the block, there's obviously + // not any matching store. + if (MBBI == B) + return false; // Track which registers have been modified and used between the first insn // and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); - for (unsigned Count = 0; MBBI != E && Count < Limit;) { + unsigned Count = 0; + do { --MBBI; - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - // Now that we know this is a real instruction, count it. - ++Count; + MachineInstr &MI = *MBBI; + + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If the load instruction reads directly from the address to which the // store instruction writes and the stored value is not modified, we can // promote the load. Since we do not handle stores with pre-/post-index, // it's unnecessary to check if BaseReg is modified by the store itself. - if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + if (MI.mayStore() && isMatchingStore(LoadMI, MI) && BaseReg == getLdStBaseOp(MI).getReg() && - isLdOffsetInRangeOfSt(FirstMI, MI) && + isLdOffsetInRangeOfSt(LoadMI, MI, TII) && !ModifiedRegs[getLdStRegOp(MI).getReg()]) { StoreI = MBBI; return true; } - if (MI->isCall()) + if (MI.isCall()) return false; // Update modified / uses register lists. @@ -1053,139 +1146,165 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) return false; - } + } while (MBBI != B && Count < Limit); return false; } -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. +// Returns true if FirstMI and MI are candidates for merging or pairing. +// Otherwise, returns false. +static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, + LdStPairFlags &Flags, + const AArch64InstrInfo *TII) { + // If this is volatile or if pairing is suppressed, not a candidate. + if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return false; + + // We should have already checked FirstMI for pair suppression and volatility. 
+ assert(!FirstMI.hasOrderedMemoryRef() && + !TII->isLdStPairSuppressed(FirstMI) && + "FirstMI shouldn't get here if either of these checks are true."); + + unsigned OpcA = FirstMI.getOpcode(); + unsigned OpcB = MI.getOpcode(); + + // Opcodes match: nothing more to check. + if (OpcA == OpcB) + return true; + + // Try to match a sign-extended load/store with a zero-extended load/store. + bool IsValidLdStrOpc, PairIsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); + // OpcA will be the first instruction in the pair. + if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { + Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); + return true; + } + + // If the second instruction isn't even a load/store, bail out. + if (!PairIsValidLdStrOpc) + return false; + + // FIXME: We don't support merging narrow loads/stores with mixed + // scaled/unscaled offsets. + if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB)) + return false; + + // Try to match an unscaled load/store with a scaled load/store. + return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && + getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); + + // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? +} + +/// Scan the instructions looking for a load/store that can be combined with the +/// current instruction into a wider equivalent or a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - LdStPairFlags &Flags, unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit, + bool FindNarrowMerge) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; + MachineInstr &FirstMI = *I; ++MBBI; - unsigned Opc = FirstMI->getOpcode(); - bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdSt(FirstMI); + bool MayLoad = FirstMI.mayLoad(); + bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); unsigned Reg = getLdStRegOp(FirstMI).getReg(); unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); - bool IsNarrowStore = isNarrowStore(Opc); - - // For narrow stores, find only the case where the stored value is WZR. - if (IsNarrowStore && Reg != AArch64::WZR) - return E; - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - - // Early exit if the offset if not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1) int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; - if (!(isNarrowLoad(Opc) || IsNarrowStore) && - !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; + bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); // Remember any instructions that read/write memory between FirstMI and MI. SmallVector<MachineInstr *, 4> MemInsns; for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Skip DBG_VALUE instructions. 
Otherwise debug info can affect the // optimization by changing how far we scan. - if (MI->isDebugValue()) + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; - bool CanMergeOpc = Opc == MI->getOpcode(); Flags.setSExtIdx(-1); - if (!CanMergeOpc) { - bool IsValidLdStrOpc; - unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - assert(IsValidLdStrOpc && - "Given Opc should be a Load or Store with an immediate"); - // Opc will be the first instruction in the pair. - Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); - CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); - } - - if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { - assert(MI->mayLoadOrStore() && "Expected memory operation."); + if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && + getLdStOffsetOp(MI).isImm()) { + assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); + bool MIIsUnscaled = TII->isUnscaledLdSt(MI); + if (IsUnscaled != MIIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. + // If FirstMI is scaled then scale the offset of MI accordingly. + // Otherwise, do the opposite (i.e., make MI's offset unscaled). + int MemSize = getMemScale(MI); + if (MIIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together: bail and keep looking. + if (MIOffset % MemSize) + continue; + MIOffset /= MemSize; + } else { + MIOffset *= MemSize; + } + } + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdSt(MI); - bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); - if (!IsNarrowLoad && - !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); - continue; - } - - if (IsNarrowLoad || IsNarrowStore) { + if (FindNarrowMerge) { // If the alignment requirements of the scaled wide load/store - // instruction can't express the offset of the scaled narrow - // input, bail and keep looking. - if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + // instruction can't express the offset of the scaled narrow input, + // bail and keep looking. For promotable zero stores, allow only when + // the stored value is the same (i.e., WZR). 
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || + (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } else { + // Pairwise instructions have a 7-bit signed offset field. Single + // insns have a 12-bit unsigned offset field. If the resultant + // immediate offset of merging these instructions is out of range for + // a pairwise instruction, bail and keep looking. + if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(&MI); + continue; + } // If the alignment requirements of the paired (scaled) instruction // can't express the offset of the unscaled input, bail and keep // looking. if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - // For narrow stores, allow only when the stored value is the same - // (i.e., WZR). - if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || - (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { + if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } @@ -1194,7 +1313,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // and first alias with the second, we can combine the second into the // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && - !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && + !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { Flags.setMergeForward(false); return MBBI; @@ -1217,7 +1336,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the instruction wasn't a matching load or store. Stop searching if we // encounter a call instruction that might modify memory. - if (MI->isCall()) + if (MI.isCall()) return E; // Update modified / uses register lists. @@ -1229,8 +1348,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // Update list of instructions that read/write memory. - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + if (MI.mayLoadOrStore()) + MemInsns.push_back(&MI); } return E; } @@ -1258,22 +1377,24 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; - if (!isPairedLdSt(I)) { + if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction. 
- int Scale = getMemScale(I); + int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I, 0)) - .addOperand(getLdStRegOp(I, 1)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value / Scale); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I, 0)) + .addOperand(getLdStRegOp(*I, 1)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value / Scale) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } (void)MIB; @@ -1296,10 +1417,10 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, - MachineInstr *MI, +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, + MachineInstr &MI, unsigned BaseReg, int Offset) { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBXri: @@ -1309,20 +1430,20 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, case AArch64::ADDXri: // Make sure it's a vanilla immediate operand, not a relocation or // anything else we can't handle. - if (!MI->getOperand(2).isImm()) + if (!MI.getOperand(2).isImm()) break; // Watch out for 1 << 12 shifted value. - if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) break; // The update instruction source and destination register must be the // same as the load/store base register. - if (MI->getOperand(0).getReg() != BaseReg || - MI->getOperand(1).getReg() != BaseReg) + if (MI.getOperand(0).getReg() != BaseReg || + MI.getOperand(1).getReg() != BaseReg) break; bool IsPairedInsn = isPairedLdSt(MemMI); - int UpdateOffset = MI->getOperand(2).getImm(); + int UpdateOffset = MI.getOperand(2).getImm(); // For non-paired load/store instructions, the immediate must fit in a // signed 9-bit integer. if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) @@ -1343,7 +1464,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) + if (!Offset || Offset == MI.getOperand(2).getImm()) return true; break; } @@ -1351,9 +1472,9 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { + MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1376,22 +1497,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. 
- if (MI->isDebugValue()) + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr &MI = *MBBI; + // Skip DBG_VALUE instructions. + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1409,7 +1528,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator I, unsigned Limit) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1430,22 +1549,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; + ModifiedRegs.reset(); + UsedRegs.reset(); + unsigned Count = 0; + do { + --MBBI; + MachineInstr &MI = *MBBI; - // Now that we know this is a real instruction, count it. - ++Count; + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1455,15 +1571,15 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // return early. if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) return E; - } + } while (MBBI != B && Count < Limit); return E; } bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // If this is a volatile load, don't mess with it. - if (MI->hasOrderedMemoryRef()) + if (MI.hasOrderedMemoryRef()) return false; // Make sure this is a reg+imm. @@ -1471,9 +1587,9 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( if (!getLdStOffsetOp(MI).isImm()) return false; - // Look backward up to ScanLimit instructions. + // Look backward up to LdStLimit instructions. MachineBasicBlock::iterator StoreI; - if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + if (findMatchingStore(MBBI, LdStLimit, StoreI)) { ++NumLoadsFromStoresPromoted; // Promote the load. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction @@ -1484,40 +1600,70 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( return false; } +// Find narrow loads that can be converted into a single wider load with +// bitfield extract instructions. Also merge adjacent zero stores into a wider +// store. bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; - MachineBasicBlock::iterator E = MI->getParent()->end(); - // If this is a volatile load/store, don't mess with it. 
- if (MI->hasOrderedMemoryRef()) - return false; + assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) && + "Expected narrow op."); + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) + if (!TII->isCandidateToMergeOrPair(MI)) return false; - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) + // For promotable zero stores, the stored value should be WZR. + if (isPromotableZeroStoreOpcode(MI) && + getLdStRegOp(MI).getReg() != AArch64::WZR) return false; - // Look ahead up to ScanLimit instructions for a pairable instruction. + // Look ahead up to LdStLimit instructions for a mergable instruction. LdStPairFlags Flags; - MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); - if (Paired != E) { + MachineBasicBlock::iterator MergeMI = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); + if (MergeMI != E) { if (isNarrowLoad(MI)) { ++NumNarrowLoadsPromoted; - } else if (isNarrowStore(MI)) { + } else if (isPromotableZeroStoreInst(MI)) { ++NumZeroStoresPromoted; - } else { - ++NumPairCreated; - if (isUnscaledLdSt(MI)) - ++NumUnscaledPairCreated; } + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. + MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags); + return true; + } + return false; +} - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. +// Find loads and stores that can be merged into a single load or store pair +// instruction. +bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + + if (!TII->isCandidateToMergeOrPair(MI)) + return false; + + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + bool IsUnscaled = TII->isUnscaledLdSt(MI); + int Offset = getLdStOffsetOp(MI).getImm(); + int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return false; + + // Look ahead up to LdStLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); + if (Paired != E) { + ++NumPairCreated; + if (TII->isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. MBBI = mergePairedInsns(MBBI, Paired, Flags); return true; } @@ -1527,7 +1673,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst( bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt) { bool Modified = false; - // Three tranformations to do here: + // Four tranformations to do here: // 1) Find loads that directly read from stores and promote them by // replacing with mov instructions. If the store is wider than the load, // the load will be replaced with a bitfield extract. 
@@ -1536,35 +1682,11 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldrh w2, [x0, #6] // ; becomes // str w1, [x0, #4] - // lsr w2, w1, #16 - // 2) Find narrow loads that can be converted into a single wider load - // with bitfield extract instructions. - // e.g., - // ldrh w0, [x2] - // ldrh w1, [x2, #2] - // ; becomes - // ldr w0, [x2] - // ubfx w1, w0, #16, #16 - // and w0, w0, #ffff - // 3) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 4) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - + // lsr w2, w1, #16 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1586,47 +1708,49 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // + // Also merge adjacent zero stores into a wider store. + // e.g., + // strh wzr, [x0] + // strh wzr, [x0, #2] + // ; becomes + // str wzr, [x0] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); enableNarrowLdOpt && MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::LDRBBui: - case AArch64::LDRHHui: - case AArch64::LDRSBWui: - case AArch64::LDRSHWui: - case AArch64::STRBBui: - case AArch64::STRHHui: - // Unscaled instructions. - case AArch64::LDURBBi: - case AArch64::LDURHHi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - case AArch64::STURBBi: - case AArch64::STURHHi: { + MachineInstr &MI = *MBBI; + unsigned Opc = MI.getOpcode(); + if (isPromotableZeroStoreOpcode(Opc) || + (EnableNarrowLdMerge && isNarrowLoad(Opc))) { if (tryToMergeLdStInst(MBBI)) { Modified = true; - break; - } + } else + ++MBBI; + } else ++MBBI; - break; - } - // FIXME: Do the other instructions. - } } + // 3) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1655,23 +1779,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - if (tryToMergeLdStInst(MBBI)) { + if (tryToPairLdStInst(MBBI)) { Modified = true; break; } ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 4) Find base register updates that can be merged into the load or store + // as a base-reg writeback. 
+ // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - unsigned Opc = MI->getOpcode(); + // switchs, though not strictly necessary. + unsigned Opc = MI.getOpcode(); switch (Opc) { default: // Just move on to the next instruction. @@ -1726,7 +1855,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // merged into: // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); @@ -1736,7 +1865,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdSt(Opc)) { + if (TII->isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1746,7 +1875,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldr x1, [x0] // merged into: // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1764,7 +1893,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); + Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1777,29 +1906,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } return Modified; } -bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { - bool ProfitableArch = Subtarget->isCortexA57(); - // FIXME: The benefit from converting narrow loads into a wider load could be - // microarchitectural as it assumes that a single load with two bitfield - // extracts is cheaper than two narrow loads. Currently, this conversion is - // enabled only in cortex-a57 on which performance benefits were verified. - return ProfitableArch && !Subtarget->requiresStrictAlign(); -} - bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(*Fn.getFunction())) + return false; + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); + // Resize the modified and used register bitfield trackers. We do this once + // per function and then clear the bitfield each time we optimize a load or + // store. 
+ ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + bool Modified = false; - bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); + bool enableNarrowLdOpt = + Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign(); for (auto &MBB : Fn) Modified |= optimizeBlock(MBB, enableNarrowLdOpt); @@ -1809,6 +1938,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? +// FIXME: When pairing store instructions it's very possible for this pass to +// hoist a store with a KILL marker above another use (without a KILL marker). +// The resulting IR is invalid, but nothing uses the KILL markers after this +// pass, so it's never caused a problem in practice. + /// createAArch64LoadStoreOptimizationPass - returns an instance of the /// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { |
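The revision also hoists the ModifiedRegs/UsedRegs BitVectors into the pass object (resized once per function, reset per query), stops counting DBG_VALUE instructions toward the search limits, and makes the backward scans actually honor those limits. The bounded-scan-with-register-tracking pattern behind findMatchingStore() and findMatchingUpdateInsnForward() looks roughly like the following standalone sketch; the Inst type, regsOf-style fields, and driver are hypothetical stand-ins, not LLVM code.

```cpp
// Sketch of the bounded scan pattern: walk up to Limit real instructions,
// skip debug values, and stop early once the base register has been
// modified or used.
#include <bitset>
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t NumRegs = 64; // placeholder for TRI->getNumRegs()

struct Inst {
  bool IsDebugValue = false;
  std::vector<unsigned> Defs, Uses;
  bool Matches = false; // stand-in for isMatchingUpdateInsn()
};

// Allocated once per function and reset per query, mirroring the
// ModifiedRegs/UsedRegs trackers this revision moves into the pass object.
struct RegTrackers {
  std::bitset<NumRegs> Modified, Used;
  void reset() { Modified.reset(); Used.reset(); }
  void track(const Inst &I) {
    for (unsigned R : I.Defs) Modified.set(R);
    for (unsigned R : I.Uses) Used.set(R);
  }
};

// Returns the index of a matching instruction after Start, or -1.
int findMatchingForward(const std::vector<Inst> &Block, std::size_t Start,
                        unsigned BaseReg, unsigned Limit, RegTrackers &Regs) {
  Regs.reset();
  unsigned Count = 0;
  for (std::size_t I = Start + 1; I < Block.size() && Count < Limit; ++I) {
    const Inst &MI = Block[I];
    if (MI.IsDebugValue) // debug info must not change how far we scan
      continue;
    ++Count;
    if (MI.Matches)
      return static_cast<int>(I);
    Regs.track(MI);
    // If the base register is clobbered or read, folding a later update
    // into the memory instruction would change observable behavior.
    if (Regs.Modified[BaseReg] || Regs.Used[BaseReg])
      return -1;
  }
  return -1;
}

int main() {
  std::vector<Inst> Block(5);
  Block[3].Defs = {7};     // clobbers the base register...
  Block[4].Matches = true; // ...before the would-be match
  RegTrackers Regs;
  std::printf("match index: %d\n",
              findMatchingForward(Block, 0, /*BaseReg=*/7, /*Limit=*/100,
                                  Regs)); // match index: -1
}
```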