Diffstat (limited to 'lib/Target')
79 files changed, 966 insertions, 756 deletions
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index ae01ea477bb9..7141e77fcd25 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1865,7 +1865,7 @@ static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); getUsefulBits(Op, OpUsefulBits, Depth + 1); // The interesting part was at zero in the argument - OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm); } UsefulBits &= OpUsefulBits; @@ -1894,13 +1894,13 @@ static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); Mask = Mask.shl(ShiftAmt); getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.lshr(ShiftAmt); + Mask.lshrInPlace(ShiftAmt); } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { // Shift Right // We do not handle AArch64_AM::ASR, because the sign will change the // number of useful bits uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.lshr(ShiftAmt); + Mask.lshrInPlace(ShiftAmt); getUsefulBits(Op, Mask, Depth + 1); Mask = Mask.shl(ShiftAmt); } else @@ -1954,7 +1954,7 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, if (Op.getOperand(1) == Orig) { // Copy the bits from the result to the zero bits. Mask = ResultUsefulBits & OpUsefulBits; - Mask = Mask.lshr(LSB); + Mask.lshrInPlace(LSB); } if (Op.getOperand(0) == Orig) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 0d3289ac84c3..4ddc95199d4c 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3239,30 +3239,26 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
- if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + auto GV = G->getGlobal(); + if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == + AArch64II::MO_GOT) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else { const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); - else { - Callee = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast<ExternalSymbolSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); + } + } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Subtarget->isTargetMachO()) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy @@ -7130,7 +7126,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; - if (I->getNumUses() != 1) + if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); @@ -10395,7 +10391,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. 
-bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 2ad6c8b23df8..a023b4373835 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -593,7 +593,7 @@ private: } bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 4449412532f3..82e9c5a88e3b 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -2586,6 +2586,11 @@ def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, Sched<[WriteF]>; } +// Similarly add aliases +def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>, + Requires<[HasFullFP16]>; +def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>; +def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>; //===----------------------------------------------------------------------===// // Floating point conversion instruction. diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 878dac6bff1e..5e01b6cd2b46 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -20,6 +20,7 @@ #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 20a5979f9b4b..6f9021c4a030 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -482,7 +482,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { auto &MO = MI.getOperand(Idx); - if (!MO.isReg()) + if (!MO.isReg() || !MO.getReg()) continue; LLT Ty = MRI.getType(MO.getReg()); @@ -537,7 +537,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands}; SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { - if (MI.getOperand(Idx).isReg()) { + if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) { auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); if (!Mapping->isValid()) return InstructionMapping(); diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 6bce4ef6b652..4bd77d344488 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -265,6 +265,12 @@ def : 
InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4 // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>; // SIMD Miscellaneous Instructions diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index b3aba4781db8..042755bd36d0 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -35,6 +35,11 @@ static cl::opt<bool> UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " "an address is ignored"), cl::init(false), cl::Hidden); +static cl::opt<bool> + UseNonLazyBind("aarch64-enable-nonlazybind", + cl::desc("Call nonlazybind functions via direct GOT load"), + cl::init(false), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, StringRef CPUString) { @@ -155,6 +160,23 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, return AArch64II::MO_NO_FLAG; } +unsigned char AArch64Subtarget::classifyGlobalFunctionReference( + const GlobalValue *GV, const TargetMachine &TM) const { + // MachO large model always goes via a GOT, because we don't have the + // relocations available to do anything else. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() && + !GV->hasInternalLinkage()) + return AArch64II::MO_GOT; + + // NonLazyBind goes via GOT unless we know it's available locally. 
+ auto *F = dyn_cast<Function>(GV); + if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) && + !TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + return AArch64II::MO_GOT; + + return AArch64II::MO_NO_FLAG; +} + /// This function returns the name of a function which has an interface /// like the non-standard bzero function, if such a function exists on /// the current subtarget and it is considered preferable over diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 40ad9185012c..3d66a9ea8ce6 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -271,6 +271,9 @@ public: unsigned char ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const; + unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, + const TargetMachine &TM) const; + /// This function returns the name of a function which has an interface /// like the non-standard bzero function, if such a function exists on /// the current subtarget and it is considered preferable over diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index cbab68979c56..d7bbc2bcd22c 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -2100,27 +2100,9 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { bool isNegative = parseOptionalToken(AsmToken::Minus); const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble(), Tok.getString()); - if (isNegative) - RealVal.changeSign(); - - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); - Parser.Lex(); // Eat the token. - // Check for out of range values. As an exception, we let Zero through, - // as we handle that special case in post-processing before matching in - // order to use the zero register for it. - if (Val == -1 && !RealVal.isPosZero()) { - TokError("expected compatible register or floating-point constant"); - return MatchOperand_ParseFail; - } - Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - if (Tok.is(AsmToken::Integer)) { + if (Tok.is(AsmToken::Real) || Tok.is(AsmToken::Integer)) { int64_t Val; - if (!isNegative && Tok.getString().startswith("0x")) { + if (Tok.is(AsmToken::Integer) && !isNegative && Tok.getString().startswith("0x")) { Val = Tok.getIntVal(); if (Val > 255 || Val < 0) { TokError("encoded floating point value out of range"); @@ -2128,10 +2110,24 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { } } else { APFloat RealVal(APFloat::IEEEdouble(), Tok.getString()); + if (isNegative) + RealVal.changeSign(); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + + // Check for out of range values. As an exception we let Zero through, + // but as tokens instead of an FPImm so that it can be matched by the + // appropriate alias if one exists. + if (RealVal.isPosZero()) { + Parser.Lex(); // Eat the token. 
+ Operands.push_back(AArch64Operand::CreateToken("#0", false, S, getContext())); + Operands.push_back(AArch64Operand::CreateToken(".0", false, S, getContext())); + return MatchOperand_Success; + } else if (Val == -1) { + TokError("expected compatible register or floating-point constant"); + return MatchOperand_ParseFail; + } } Parser.Lex(); // Eat the token. Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); @@ -3655,21 +3651,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } } - // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. - if (NumOperands == 3 && Tok == "fmov") { - AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]); - AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); - if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { - unsigned zreg = - !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( - RegOp.getReg()) - ? AArch64::WZR - : AArch64::XZR; - Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(), - Op.getEndLoc(), getContext()); - } - } - MCInst Inst; // First try to match against the secondary set of tables containing the // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 8fc822329595..94112849f84e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -39,7 +39,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; AlignmentIsInBytes = false; UsesELFSectionDirectiveForBSS = true; @@ -71,7 +71,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { // We prefer NEON instructions to be printed in the short form. AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; - PointerSize = 8; + CodePointerSize = 8; // ".comm align is in bytes but .align is pow-2." AlignmentIsInBytes = false; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0446655830d1..a81bcb56dfdc 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -144,6 +144,10 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( } void AMDGPUAsmPrinter::EmitFunctionBodyStart() { + const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>(); + if (!MFI->isEntryFunction()) + return; + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; amd_kernel_code_t KernelCode; @@ -184,9 +188,11 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); // The starting address of all shader programs must be 256 bytes aligned. - MF.setAlignment(8); + // Regular functions just need the basic required instruction alignment. + MF.setAlignment(MFI->isEntryFunction() ? 
8 : 2); SetupMachineFunction(MF); @@ -220,13 +226,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->SwitchSection(CommentSection); if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); + if (MFI->isEntryFunction()) { + OutStreamer->emitRawComment(" Kernel info:", false); + } else { + OutStreamer->emitRawComment(" Function info:", false); + } + + OutStreamer->emitRawComment(" codeLenInByte = " + + Twine(getFunctionCodeSize(MF)), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), false); + OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), false); OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), @@ -236,6 +248,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + " bytes/workgroup (compile time only)", false); + if (!MFI->isEntryFunction()) + return false; + OutStreamer->emitRawComment(" SGPRBlocks: " + Twine(KernelInfo.SGPRBlocks), false); OutStreamer->emitRawComment(" VGPRBlocks: " + @@ -317,7 +332,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { const MachineOperand &MO = MI.getOperand(op_idx); if (!MO.isReg()) continue; - unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + unsigned HWReg = RI->getHWRegIndex(MO.getReg()); // Registers with value > 127 aren't GPRs if (HWReg > 127) @@ -360,18 +375,12 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } } -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { +uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = STM.getRegisterInfo(); const SIInstrInfo *TII = STM.getInstrInfo(); + uint64_t CodeSize = 0; + for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. 
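Aside on the alignment values in runOnMachineFunction above: MachineFunction::setAlignment() takes a log2 value in this revision of LLVM, so the isEntryFunction() split selects byte alignments as follows. A minimal standalone sketch of that reading (chooseLog2Alignment is a hypothetical name, not code from the patch):

// Assumes setAlignment() semantics of this era: the argument is log2(bytes).
unsigned chooseLog2Alignment(bool IsEntryFunction) {
  // Kernels: 1u << 8 == 256-byte alignment, required for shader program
  // start addresses. Other functions: 1u << 2 == 4 bytes, the basic
  // instruction alignment on this target (MinInstAlignment = 4).
  return IsEntryFunction ? 8 : 2;
}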
@@ -380,122 +389,86 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (MI.isDebugValue()) continue; - if (isVerbose()) - CodeSize += TII->getInstSizeInBytes(MI); + CodeSize += TII->getInstSizeInBytes(MI); + } + } - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; + return CodeSize; +} - if (!MO.isReg()) - continue; +static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, + const SIInstrInfo &TII, + unsigned Reg) { + for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { + if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) + return true; + } - unsigned reg = MO.getReg(); - switch (reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT: - continue; + return false; +} - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - VCCUsed = true; - continue; +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *RI = &TII->getRegisterInfo(); - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat - // instructions aren't used to access the scratch buffer. - if (MFI->hasFlatScratchInit()) - FlatUsed = true; - continue; - case AMDGPU::TBA: - case AMDGPU::TBA_LO: - case AMDGPU::TBA_HI: - case AMDGPU::TMA: - case AMDGPU::TMA_LO: - case AMDGPU::TMA_HI: - llvm_unreachable("trap handler registers should not be used"); - - default: - break; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_32RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_64RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? 
maxUsed : MaxVGPR; - } - } + MCPhysReg NumVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + NumVGPRReg = Reg; + break; + } + } + + MCPhysReg NumSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + NumSGPRReg = Reg; + break; } } + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 : + RI->getHWRegIndex(NumVGPRReg) + 1; + ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 : + RI->getHWRegIndex(NumSGPRReg) + 1; unsigned ExtraSGPRs = 0; - if (VCCUsed) + ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); + if (ProgInfo.VCCUsed) ExtraSGPRs = 2; + ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); + + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. Inline assembly + // may need it though. + // + // If we only have implicit uses of flat_scr on flat instructions, it is not + // really needed. + if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() && + (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { + ProgInfo.FlatUsed = false; + } + if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (FlatUsed) + if (ProgInfo.FlatUsed) ExtraSGPRs = 4; } else { if (STM.isXNACKEnabled()) ExtraSGPRs = 4; - if (FlatUsed) + if (ProgInfo.FlatUsed) ExtraSGPRs = 6; } @@ -505,34 +478,29 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); - if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "addressable scalar registers", - MaxSGPR + 1, DS_Error, + ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - MaxSGPR = MaxAddressableNumSGPRs - 1; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; } } // Account for extra SGPRs and VGPRs reserved for debugger use. - MaxSGPR += ExtraSGPRs; - MaxVGPR += ExtraVGPRs; - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - ProgInfo.NumSGPR = MaxSGPR + 1; - ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR += ExtraSGPRs; + ProgInfo.NumVGPR += ExtraVGPRs; // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. 
ProgInfo.NumSGPRsForWavesPerEU = std::max( - ProgInfo.NumSGPR, STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); ProgInfo.NumVGPRsForWavesPerEU = std::max( - ProgInfo.NumVGPR, STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { @@ -559,10 +527,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; } - if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", - MFI->NumUserSGPRs, DS_Error); + MFI->getNumUserSGPRs(), DS_Error); Ctx.diagnose(Diag); } @@ -584,7 +552,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; // Record first reserved VGPR and number of reserved VGPRs. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; + ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0; ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and @@ -609,10 +577,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo.getStackSize(); - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; - unsigned LDSAlignShift; if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. @@ -623,7 +587,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } unsigned LDSSpillSize = - MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize(); + MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize(); ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize; ProgInfo.LDSBlocks = diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 13425c8b2a0f..8c86dea4b885 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -55,7 +55,7 @@ private: uint32_t NumVGPR = 0; uint32_t NumSGPR = 0; - uint32_t LDSSize; + uint32_t LDSSize = 0; bool FlatUsed = false; // Number of SGPRs that meets number of waves per execution unit request. @@ -85,11 +85,11 @@ private: // Bonus information for debugging. bool VCCUsed = false; - uint64_t CodeLen = 0; SIProgramInfo() = default; }; + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 36bc2498781f..a5cda817ac11 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -415,9 +415,11 @@ public: return 0; } + // Scratch is allocated in 256 dword per wave blocks for the entire + // wavefront. When viewed from the perspective of an arbitrary workitem, this + // is 4-byte aligned. unsigned getStackAlignment() const { - // Scratch is allocated in 256 dword blocks. 
- return 4 * 256 / getWavefrontSize(); + return 4; } bool enableMachineScheduler() const override { diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 01ac9968181a..6edd3e923ba1 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -426,16 +426,23 @@ static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); // Arguments to compute shaders are never a source of divergence. - if (!AMDGPU::isShader(F->getCallingConv())) + CallingConv::ID CC = F->getCallingConv(); + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: return true; - - // For non-compute shaders, SGPR inputs are marked with either inreg or byval. - if (F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || - F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal)) - return true; - - // Everything else is in VGPRs. - return false; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + // Everything else is in VGPRs. + return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || + F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal); + default: + // TODO: Should calls support inreg for SGPR inputs? + return false; + } } /// diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index a9f64589fa5e..357e18108e7e 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -255,8 +255,6 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> [(set i32:$vdst, (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > { - let LGKM_CNT = 0; - let mayLoad = 0; let mayStore = 0; let isConvergent = 1; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 1655591abf39..6c61fb1f2d6b 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -14,6 +14,7 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { + CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 
8 : 4; HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 7268131396dc..dd867b15b4c7 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -461,6 +461,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + } else { + setOperationAction(ISD::SELECT, MVT::v2i16, Custom); + setOperationAction(ISD::SELECT, MVT::v2f16, Custom); + } + + for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { + setOperationAction(ISD::SELECT, VT, Custom); } setTargetDAGCombine(ISD::FADD); @@ -2191,6 +2198,28 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, break; } } + case ISD::SELECT: { + SDLoc SL(N); + EVT VT = N->getValueType(0); + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); + SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); + + EVT SelectVT = NewVT; + if (NewVT.bitsLT(MVT::i32)) { + LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS); + RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS); + SelectVT = MVT::i32; + } + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT, + N->getOperand(0), LHS, RHS); + + if (NewVT != SelectVT) + NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); + return; + } default: break; } } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index a84f3e274f82..810fb05984c4 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -133,14 +133,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { AMDGPUBufferPseudoSourceValue BufferPSV; AMDGPUImagePseudoSourceValue ImagePSV; -public: - // FIXME: Make private +private: unsigned LDSWaveSpillSize; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; unsigned NumSystemSGPRs; -private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; bool HasNonSpillStackObjects; @@ -535,6 +533,10 @@ public: llvm_unreachable("unexpected dimension"); } + unsigned getLDSWaveSpillSize() const { + return LDSWaveSpillSize; + } + const AMDGPUBufferPseudoSourceValue *getBufferPSV() const { return &BufferPSV; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 36d4df52ff0e..098c67252dd8 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -124,7 +124,7 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( unsigned RegCount = ST.getMaxNumSGPRs(MF); unsigned Reg; - // Try to place it in a hole after PrivateSegmentbufferReg. + // Try to place it in a hole after PrivateSegmentBufferReg. if (RegCount & 3) { // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to // alignment constraints, so we have a hole where we can put the wave offset. 
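Why the new ISD::SELECT expansion in SIISelLowering.cpp above is sound: a vector select with a scalar condition picks one operand wholesale, so it can be performed on the equivalent-width integer after bitcasting, with an any-extend/truncate pair for types narrower than 32 bits. A self-contained sketch of the underlying bit-level equivalence (illustrative helper name, not LLVM API):

#include <cstdint>

// select c, a:v2f16, b:v2f16 produces the same 32 bits as selecting the
// operands' bit patterns as i32, which is what ReplaceNodeResults builds
// with BITCAST + SELECT (plus ANY_EXTEND/TRUNCATE for sub-32-bit types
// such as v2i8, whose equivalent integer type is i16).
uint32_t selectV2F16AsBits(bool Cond, uint32_t ABits, uint32_t BBits) {
  return Cond ? ABits : BBits; // one 32-bit select, no per-lane work
}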
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 57f9d1c6b610..005b74a68af3 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -67,8 +67,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", "Restrict FP to 16 double registers">; -def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", - "Enable divide instructions">; +def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb", + "true", + "Enable divide instructions in Thumb">; def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", "HasHardwareDivideInARM", "true", "Enable divide instructions in ARM mode">; @@ -225,7 +226,7 @@ def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", def FeatureVirtualization : SubtargetFeature<"virtualization", "HasVirtualization", "true", "Supports Virtualization extension", - [FeatureHWDiv, FeatureHWDivARM]>; + [FeatureHWDivThumb, FeatureHWDivARM]>; // M-series ISA def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", @@ -433,21 +434,21 @@ def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops, def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, FeatureDB, FeatureDSP, - FeatureHWDiv, + FeatureHWDivThumb, FeatureRClass]>; def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, + FeatureHWDivThumb, FeatureMClass]>; def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, + FeatureHWDivThumb, FeatureMClass, FeatureDSP]>; @@ -502,7 +503,7 @@ def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline", [HasV8MBaselineOps, FeatureNoARM, FeatureDB, - FeatureHWDiv, + FeatureHWDivThumb, FeatureV7Clrex, Feature8MSecExt, FeatureAcquireRelease, @@ -512,7 +513,7 @@ def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline", [HasV8MMainlineOps, FeatureNoARM, FeatureDB, - FeatureHWDiv, + FeatureHWDivThumb, Feature8MSecExt, FeatureAcquireRelease, FeatureMClass]>; @@ -678,7 +679,7 @@ def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, FeatureFP16, FeatureAvoidPartialCPSR, FeatureVFP4, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM]>; def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, @@ -686,7 +687,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureNEONForFP, FeatureVFP4, FeatureMP, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureAvoidPartialCPSR, FeatureAvoidMOVsShOp, @@ -768,39 +769,39 @@ def : ProcNoItin<"cortex-m33", [ARMv8mMainline, FeatureVFPOnlySP]>; def : ProcNoItin<"cortex-a32", [ARMv8a, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC, FeatureFPAO]>; def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC, FeatureFPAO]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; @@ -811,7 +812,7 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, 
ProcSwift, FeatureNEONForFP, FeatureVFP4, FeatureMP, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureAvoidPartialCPSR, FeatureAvoidMOVsShOp, @@ -820,25 +821,25 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureZCZeroing]>; def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynosM1, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"kryo", [ARMv8a, ProcKryo, - FeatureHWDiv, + FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index eb0d410b596b..14e197f477f1 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -589,12 +589,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { ATS.finishAttributeSection(); } -static bool isV8M(const ARMSubtarget *Subtarget) { - // Note that v8M Baseline is a subset of v6T2! - return (Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops()) || - Subtarget->hasV8MMainlineOps(); -} - //===----------------------------------------------------------------------===// // Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() // FIXME: @@ -602,39 +596,6 @@ static bool isV8M(const ARMSubtarget *Subtarget) { // to appear in the .ARM.attributes section in ELF. // Instead of subclassing the MCELFStreamer, we do the work here. -static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, - const ARMSubtarget *Subtarget) { - if (CPU == "xscale") - return ARMBuildAttrs::v5TEJ; - - if (Subtarget->hasV8Ops()) { - if (Subtarget->isRClass()) - return ARMBuildAttrs::v8_R; - return ARMBuildAttrs::v8_A; - } else if (Subtarget->hasV8MMainlineOps()) - return ARMBuildAttrs::v8_M_Main; - else if (Subtarget->hasV7Ops()) { - if (Subtarget->isMClass() && Subtarget->hasDSP()) - return ARMBuildAttrs::v7E_M; - return ARMBuildAttrs::v7; - } else if (Subtarget->hasV6T2Ops()) - return ARMBuildAttrs::v6T2; - else if (Subtarget->hasV8MBaselineOps()) - return ARMBuildAttrs::v8_M_Base; - else if (Subtarget->hasV6MOps()) - return ARMBuildAttrs::v6S_M; - else if (Subtarget->hasV6Ops()) - return ARMBuildAttrs::v6; - else if (Subtarget->hasV5TEOps()) - return ARMBuildAttrs::v5TE; - else if (Subtarget->hasV5TOps()) - return ARMBuildAttrs::v5T; - else if (Subtarget->hasV4TOps()) - return ARMBuildAttrs::v4T; - else - return ARMBuildAttrs::v4; -} - // Returns true if all functions have the same function attribute value. // It also returns true when the module has no functions. 
static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr, @@ -671,89 +632,8 @@ void ARMAsmPrinter::emitAttributes() { static_cast<const ARMBaseTargetMachine &>(TM); const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian()); - const std::string &CPUString = STI.getCPUString(); - - if (!StringRef(CPUString).startswith("generic")) { - // FIXME: remove krait check when GNU tools support krait cpu - if (STI.isKrait()) { - ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9"); - // We consider krait as a "cortex-a9" + hwdiv CPU - // Enable hwdiv through ".arch_extension idiv" - if (STI.hasDivide() || STI.hasDivideInARMMode()) - ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM); - } else - ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); - } - - ATS.emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(CPUString, &STI)); - - // Tag_CPU_arch_profile must have the default value of 0 when "Architecture - // profile is not applicable (e.g. pre v7, or cross-profile code)". - if (STI.hasV7Ops() || isV8M(&STI)) { - if (STI.isAClass()) { - ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, - ARMBuildAttrs::ApplicationProfile); - } else if (STI.isRClass()) { - ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, - ARMBuildAttrs::RealTimeProfile); - } else if (STI.isMClass()) { - ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, - ARMBuildAttrs::MicroControllerProfile); - } - } - - ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, - STI.hasARMOps() ? ARMBuildAttrs::Allowed - : ARMBuildAttrs::Not_Allowed); - if (isV8M(&STI)) { - ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, - ARMBuildAttrs::AllowThumbDerived); - } else if (STI.isThumb1Only()) { - ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed); - } else if (STI.hasThumb2()) { - ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, - ARMBuildAttrs::AllowThumb32); - } - - if (STI.hasNEON()) { - /* NEON is not exactly a VFP architecture, but GAS emit one of - * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ - if (STI.hasFPARMv8()) { - if (STI.hasCrypto()) - ATS.emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8); - else - ATS.emitFPU(ARM::FK_NEON_FP_ARMV8); - } else if (STI.hasVFP4()) - ATS.emitFPU(ARM::FK_NEON_VFPV4); - else - ATS.emitFPU(STI.hasFP16() ? ARM::FK_NEON_FP16 : ARM::FK_NEON); - // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture - if (STI.hasV8Ops()) - ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, - STI.hasV8_1aOps() ? ARMBuildAttrs::AllowNeonARMv8_1a: - ARMBuildAttrs::AllowNeonARMv8); - } else { - if (STI.hasFPARMv8()) - // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one - // FPU, but there are two different names for it depending on the CPU. - ATS.emitFPU(STI.hasD16() - ? (STI.isFPOnlySP() ? ARM::FK_FPV5_SP_D16 : ARM::FK_FPV5_D16) - : ARM::FK_FP_ARMV8); - else if (STI.hasVFP4()) - ATS.emitFPU(STI.hasD16() - ? (STI.isFPOnlySP() ? ARM::FK_FPV4_SP_D16 : ARM::FK_VFPV4_D16) - : ARM::FK_VFPV4); - else if (STI.hasVFP3()) - ATS.emitFPU(STI.hasD16() - // +d16 - ? (STI.isFPOnlySP() - ? (STI.hasFP16() ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD) - : (STI.hasFP16() ? ARM::FK_VFPV3_D16_FP16 : ARM::FK_VFPV3_D16)) - // -d16 - : (STI.hasFP16() ? ARM::FK_VFPV3_FP16 : ARM::FK_VFPV3)); - else if (STI.hasVFP2()) - ATS.emitFPU(ARM::FK_VFPV2); - } + // Emit build attributes for the available hardware. + ATS.emitTargetAttributes(STI); // RW data addressing. 
if (isPositionIndependent()) { @@ -846,32 +726,15 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model, ARMBuildAttrs::AllowIEEE754); - if (STI.allowsUnalignedMem()) - ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access, - ARMBuildAttrs::Allowed); - else - ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access, - ARMBuildAttrs::Not_Allowed); - // FIXME: add more flags to ARMBuildAttributes.h // 8-bytes alignment stuff. ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1); ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1); - // ABI_HardFP_use attribute to indicate single precision FP. - if (STI.isFPOnlySP()) - ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use, - ARMBuildAttrs::HardFPSinglePrecision); - // Hard float. Use both S and D registers and conform to AAPCS-VFP. if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS); - // FIXME: Should we signal R9 usage? - - if (STI.hasFP16()) - ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP); - // FIXME: To support emitting this build attribute as GCC does, the // -mfp16-format option and associated plumbing must be // supported. For now the __fp16 type is exposed by default, so this @@ -879,21 +742,6 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format, ARMBuildAttrs::FP16FormatIEEE); - if (STI.hasMPExtension()) - ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP); - - // Hardware divide in ARM mode is part of base arch, starting from ARMv8. - // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M). - // It is not possible to produce DisallowDIV: if hwdiv is present in the base - // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits. - // AllowDIVExt is only emitted if hwdiv isn't available in the base arch; - // otherwise, the default value (AllowDIVIfExists) applies. - if (STI.hasDivideInARMMode() && !STI.hasV8Ops()) - ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt); - - if (STI.hasDSP() && isV8M(&STI)) - ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed); - if (MMI) { if (const Module *SourceModule = MMI->getModule()) { // ABI_PCS_wchar_t to indicate wchar_t width @@ -930,16 +778,6 @@ void ARMAsmPrinter::emitAttributes() { else ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9IsGPR); - - if (STI.hasTrustZone() && STI.hasVirtualization()) - ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, - ARMBuildAttrs::AllowTZVirtualization); - else if (STI.hasTrustZone()) - ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, - ARMBuildAttrs::AllowTZ); - else if (STI.hasVirtualization()) - ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, - ARMBuildAttrs::AllowVirtualization); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 23777b821f9f..faf1c631a3a7 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -404,6 +404,29 @@ public: /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. bool isSwiftFastImmShift(const MachineInstr *MI) const; + + /// Returns predicate register associated with the given frame instruction. 
+ unsigned getFramePred(const MachineInstr &MI) const { + assert(isFrameInstr(MI)); + if (isFrameSetup(MI)) + // Operands of ADJCALLSTACKDOWN: + // - argument declared in ADJCALLSTACKDOWN pattern: + // 0 - frame size + // 1 - predicate code (like ARMCC::AL) + // - added by predOps: + // 2 - predicate reg + return MI.getOperand(2).getReg(); + assert(MI.getOpcode() == ARM::ADJCALLSTACKUP || + MI.getOpcode() == ARM::tADJCALLSTACKUP); + // Operands of ADJCALLSTACKUP: + // - argument declared in ADJCALLSTACKUP pattern: + // 0 - frame size + // 1 - arg of CALLSEQ_END + // 2 - predicate code + // - added by predOps: + // 3 - predicate reg + return MI.getOperand(3).getReg(); + } }; /// Get the operands corresponding to the given \p Pred value. By default, the diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 7a7b7fede7c8..bc7afdb7f1c9 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -273,9 +273,9 @@ def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>; def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS_ThisReturn, R9))>; -def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP, - (sequence "R%u", 12, 1), - (sequence "D%u", 31, 0))>; +def CSR_iOS_TLSCall + : CalleeSavedRegs<(add LR, SP, (sub(sequence "R%u", 12, 1), R9, R12), + (sequence "D%u", 31, 0))>; // C++ TLS access function saves all registers except SP. Try to match // the order of CSRs in CSR_iOS. diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 23722f1b7f3f..6434df317aa8 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1741,10 +1741,9 @@ bool ARMConstantIslands::undoLRSpillRestore() { .add(MI->getOperand(1)); MI->eraseFromParent(); MadeChange = true; - } - if (MI->getOpcode() == ARM::tPUSH && - MI->getOperand(2).getReg() == ARM::LR && - MI->getNumExplicitOperands() == 3) { + } else if (MI->getOpcode() == ARM::tPUSH && + MI->getOperand(2).getReg() == ARM::LR && + MI->getNumExplicitOperands() == 3) { // Just remove the push. MI->eraseFromParent(); MadeChange = true; @@ -2158,6 +2157,15 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { // If we're in PIC mode, there should be another ADD following. auto *TRI = STI->getRegisterInfo(); + + // %base cannot be redefined after the load as it will appear before + // TBB/TBH like: + // %base = + // %base = + // tBB %base, %idx + if (registerDefinedBetween(BaseReg, Load->getNextNode(), MBB->end(), TRI)) + continue; + if (isPositionIndependentOrROPI) { MachineInstr *Add = Load->getNextNode(); if (Add->getOpcode() != ARM::tADDrr || diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 01e062bd185c..e9bc7db66fa4 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1702,7 +1702,8 @@ bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) { // If we have integer div support we should have selected this automagically. // In case we have a real miss go ahead and return false and we'll pick // it up later. - if (Subtarget->hasDivide()) return false; + if (Subtarget->hasDivideInThumbMode()) + return false; // Otherwise emit a libcall. 
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 37be22bed540..70dbe1bc5b95 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -322,6 +322,18 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, } } +/// We need the offset of the frame pointer relative to other MachineFrameInfo +/// offsets which are encoded relative to SP at function begin. +/// See also emitPrologue() for how the FP is set up. +/// Unfortunately we cannot determine this value in determineCalleeSaves() yet +/// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use +/// this to produce a conservative estimate that we check in an assert() later. +static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) { + // This is a conservative estimation: Assume the frame pointer is r7 and + // that pc ("r15") down to r8 (= 8 registers) get spilled before it. + return -AFI.getArgRegsSaveSize() - (8 * 4); +} + void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -432,8 +444,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; int FramePtrOffsetInPush = 0; if (HasFP) { - FramePtrOffsetInPush = - MFI.getObjectOffset(FramePtrSpillFI) + ArgRegsSaveSize; + int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); + assert(getMaxFPOffset(*MF.getFunction(), *AFI) <= FPOffset && + "Max FP estimation is wrong"); + FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize; AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + NumBytes); } @@ -1700,6 +1714,14 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // worth the effort and added fragility? unsigned EstimatedStackSize = MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills); + + // Determine biggest (positive) SP offset in MachineFrameInfo. + int MaxFixedOffset = 0; + for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { + int MaxObjectOffset = MFI.getObjectOffset(I) + MFI.getObjectSize(I); + MaxFixedOffset = std::max(MaxFixedOffset, MaxObjectOffset); + } + bool HasFP = hasFP(MF); if (HasFP) { if (AFI->hasStackFrame()) @@ -1707,15 +1729,20 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, } else { // If FP is not used, SP will be used to access arguments, so count the // size of arguments into the estimation. - EstimatedStackSize += AFI->getArgumentStackSize(); + EstimatedStackSize += MaxFixedOffset; } EstimatedStackSize += 16; // For possible paddings. - bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) || - MFI.hasVarSizedObjects() || - (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)); + unsigned EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this); + int MaxFPOffset = getMaxFPOffset(*MF.getFunction(), *AFI); + bool BigFrameOffsets = EstimatedStackSize >= EstimatedRSStackSizeLimit || + MFI.hasVarSizedObjects() || + (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)) || + // For large argument stacks fp relative addressing may overflow. 
+ (HasFP && (MaxFixedOffset - MaxFPOffset) >= (int)EstimatedRSStackSizeLimit); bool ExtraCSSpill = false; - if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { + if (BigFrameOffsets || + !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { AFI->setHasStackFrame(true); if (HasFP) { @@ -1899,7 +1926,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // callee-saved register or reserve a special spill slot to facilitate // register scavenging. Thumb1 needs a spill slot for stack pointer // adjustments also, even when the frame itself is small. - if (BigStack && !ExtraCSSpill) { + if (BigFrameOffsets && !ExtraCSSpill) { // If any non-reserved CS register isn't spilled, just spill one or two // extra. That should take care of it! unsigned NumExtras = TargetAlign / 4; @@ -1958,7 +1985,7 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( // ADJCALLSTACKUP -> add, sp, sp, amount MachineInstr &Old = *I; DebugLoc dl = Old.getDebugLoc(); - unsigned Amount = Old.getOperand(0).getImm(); + unsigned Amount = TII.getFrameSize(Old); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -1976,14 +2003,11 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( ARMCC::CondCodes Pred = (PIdx == -1) ? ARMCC::AL : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm(); + unsigned PredReg = TII.getFramePred(Old); if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { - // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. - unsigned PredReg = Old.getOperand(2).getReg(); emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags, Pred, PredReg); } else { - // Note: PredReg is operand 3 for ADJCALLSTACKUP. - unsigned PredReg = Old.getOperand(3).getReg(); assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags, Pred, PredReg); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index b07b4e1f5cfb..e9df9449103c 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -228,11 +228,6 @@ private: const uint16_t *DOpcodes, const uint16_t *QOpcodes = nullptr); - /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, - /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be - /// generated to force the table registers to be consecutive. - void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); - /// Try to select SBFX/UBFX instructions for ARM. bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned); @@ -544,11 +539,11 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, SDValue NewMulConst; if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { HandleSDNode Handle(N); + SDLoc Loc(N); replaceDAGValue(N.getOperand(1), NewMulConst); BaseReg = Handle.getValue(); - Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, - PowerOfTwo), - SDLoc(N), MVT::i32); + Opc = CurDAG->getTargetConstant( + ARM_AM::getSORegOpc(ARM_AM::lsl, PowerOfTwo), Loc, MVT::i32); return true; } } @@ -1859,6 +1854,14 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { return Opc; // If not one we handle, return it unchanged. } +/// Returns true if the given increment is a Constant known to be equal to the +/// access size performed by a NEON load/store. This means the "[rN]!" form can +/// be used. 
+static bool isPerfectIncrement(SDValue Inc, EVT VecTy, unsigned NumVecs) { + auto C = dyn_cast<ConstantSDNode>(Inc); + return C && C->getZExtValue() == VecTy.getSizeInBits() / 8 * NumVecs; +} + void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, @@ -1926,13 +1929,13 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, SDValue Inc = N->getOperand(AddrOpIdx + 1); // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0 // case entirely when the rest are updated to that form, too. - if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode())) + bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs); + if ((NumVecs <= 2) && !IsImmUpdate) Opc = getVLDSTRegisterUpdateOpcode(Opc); // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so // check for that explicitly too. Horribly hacky, but temporary. - if ((NumVecs > 2 && !isVLDfixed(Opc)) || - !isa<ConstantSDNode>(Inc.getNode())) - Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate) + Ops.push_back(IsImmUpdate ? Reg0 : Inc); } Ops.push_back(Pred); Ops.push_back(Reg0); @@ -2080,11 +2083,12 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDValue Inc = N->getOperand(AddrOpIdx + 1); // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0 // case entirely when the rest are updated to that form, too. - if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode())) + bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs); + if (NumVecs <= 2 && !IsImmUpdate) Opc = getVLDSTRegisterUpdateOpcode(Opc); // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so // check for that explicitly too. Horribly hacky, but temporary. - if (!isa<ConstantSDNode>(Inc.getNode())) + if (!IsImmUpdate) Ops.push_back(Inc); else if (NumVecs > 2 && !isVSTfixed(Opc)) Ops.push_back(Reg0); @@ -2214,7 +2218,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, Ops.push_back(Align); if (isUpdating) { SDValue Inc = N->getOperand(AddrOpIdx + 1); - Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + bool IsImmUpdate = + isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs); + Ops.push_back(IsImmUpdate ? Reg0 : Inc); } SDValue SuperReg; @@ -2318,9 +2324,11 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, // fixed-stride update instructions don't have an explicit writeback // operand. It's implicit in the opcode itself. SDValue Inc = N->getOperand(2); - if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode())) + bool IsImmUpdate = + isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs); + if (NumVecs <= 2 && !IsImmUpdate) Opc = getVLDSTRegisterUpdateOpcode(Opc); - if (!isa<ConstantSDNode>(Inc.getNode())) + if (!IsImmUpdate) Ops.push_back(Inc); // FIXME: VLD3 and VLD4 haven't been updated to that form yet. else if (NumVecs > 2) @@ -2356,39 +2364,6 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } -void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, - unsigned Opc) { - assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); - SDLoc dl(N); - EVT VT = N->getValueType(0); - unsigned FirstTblReg = IsExt ? 2 : 1; - - // Form a REG_SEQUENCE to force register allocation. 
- SDValue RegSeq; - SDValue V0 = N->getOperand(FirstTblReg + 0); - SDValue V1 = N->getOperand(FirstTblReg + 1); - if (NumVecs == 2) - RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0); - else { - SDValue V2 = N->getOperand(FirstTblReg + 2); - // If it's a vtbl3, form a quad D-register and leave the last part as - // an undef. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) - : N->getOperand(FirstTblReg + 3); - RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0); - } - - SmallVector<SDValue, 6> Ops; - if (IsExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); - Ops.push_back(getAL(CurDAG, dl)); // predicate - Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register - ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); -} - bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) return false; @@ -3730,59 +3705,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) { break; } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: - break; - - case Intrinsic::arm_neon_vtbl2: - SelectVTBL(N, false, 2, ARM::VTBL2); - return; - case Intrinsic::arm_neon_vtbl3: - SelectVTBL(N, false, 3, ARM::VTBL3Pseudo); - return; - case Intrinsic::arm_neon_vtbl4: - SelectVTBL(N, false, 4, ARM::VTBL4Pseudo); - return; - - case Intrinsic::arm_neon_vtbx2: - SelectVTBL(N, true, 2, ARM::VTBX2); - return; - case Intrinsic::arm_neon_vtbx3: - SelectVTBL(N, true, 3, ARM::VTBX3Pseudo); - return; - case Intrinsic::arm_neon_vtbx4: - SelectVTBL(N, true, 4, ARM::VTBX4Pseudo); - return; - } - break; - } - - case ARMISD::VTBL1: { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Ops[] = {N->getOperand(0), N->getOperand(1), - getAL(CurDAG, dl), // Predicate - CurDAG->getRegister(0, MVT::i32)}; // Predicate Register - ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops)); - return; - } - case ARMISD::VTBL2: { - SDLoc dl(N); - EVT VT = N->getValueType(0); - - // Form a REG_SEQUENCE to force register allocation. - SDValue V0 = N->getOperand(0); - SDValue V1 = N->getOperand(1); - SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0); - - SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate - CurDAG->getRegister(0, MVT::i32)}; // Predicate Register - ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops)); - return; - } - case ISD::ATOMIC_CMP_SWAP: SelectCMP_SWAP(N); return; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e697c8ca5339..165e9b7378c7 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -852,7 +852,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); - bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide() + bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() : Subtarget->hasDivideInARMMode(); if (!hasDivide) { // These are expanded into libcalls if the cpu doesn't have HW divider. 
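A note on the hasDivide() rename in the hunk above: the old name described only the Thumb-encoded SDIV/UDIV, so the mode it applied to was easy to get wrong at call sites. A minimal self-contained sketch of the corrected selection logic follows; DivideCaps and divideIsLegal are illustrative stand-ins, not LLVM API, and the CPU examples in main() are assumptions for the test only.

#include <cassert>

// Stand-ins for Subtarget->hasDivideInThumbMode()/hasDivideInARMMode().
struct DivideCaps {
  bool InThumbMode; // generating Thumb code?
  bool HWDivThumb;  // SDIV/UDIV available in the Thumb encoding
  bool HWDivARM;    // SDIV/UDIV available in the ARM encoding
};

// The divider must exist in the instruction-set state we are compiling for;
// otherwise SDIV/UDIV are expanded to libcalls (__aeabi_idiv/__aeabi_uidiv
// on AEABI targets), as the surrounding hunks configure via
// setOperationAction(..., LibCall).
bool divideIsLegal(const DivideCaps &C) {
  return C.InThumbMode ? C.HWDivThumb : C.HWDivARM;
}

int main() {
  // Cortex-R5-like core: hardware divide in both states.
  assert(divideIsLegal({/*InThumbMode=*/false, /*HWDivThumb=*/true,
                        /*HWDivARM=*/true}));
  // Cortex-A9-like core: no hardware divide, libcall either way.
  assert(!divideIsLegal({/*InThumbMode=*/true, /*HWDivThumb=*/false,
                         /*HWDivARM=*/false}));
  return 0;
}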
@@ -860,7 +860,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i32, LibCall); } - if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) { + if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); @@ -2633,7 +2633,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; @@ -3347,6 +3347,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::arm_neon_vtbl1: + return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::arm_neon_vtbl2: + return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -10867,11 +10873,8 @@ static SDValue CombineBaseUpdate(SDNode *N, // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { - uint64_t IncVal = CInc->getZExtValue(); - if (IncVal != NumBytes) - continue; - } else if (NumBytes >= 3 * 16) { + ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); + if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two // separate instructions that make it harder to use a non-constant update. continue; @@ -11688,34 +11691,6 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, - APInt &KnownOne) { - if (Op.getOpcode() == ARMISD::BFI) { - // Conservatively, we can recurse down the first operand - // and just mask out all affected bits. - computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); - - // The operand to BFI is already a mask suitable for removing the bits it - // sets. - ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); - const APInt &Mask = CI->getAPIntValue(); - KnownZero &= Mask; - KnownOne &= Mask; - return; - } - if (Op.getOpcode() == ARMISD::CMOV) { - APInt KZ2(KnownZero.getBitWidth(), 0); - APInt KO2(KnownOne.getBitWidth(), 0); - computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); - computeKnownBits(DAG, Op.getOperand(1), KZ2, KO2); - - KnownZero &= KZ2; - KnownOne &= KO2; - return; - } - return DAG.computeKnownBits(Op, KnownZero, KnownOne); -} - SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { // If we have a CMOV, OR and AND combination such as: // if (x & CN) @@ -11777,7 +11752,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D // Lastly, can we determine that the bits defined by OrCI // are zero in Y? 
APInt KnownZero, KnownOne; - computeKnownBits(DAG, Y, KnownZero, KnownOne); + DAG.computeKnownBits(Y, KnownZero, KnownOne); if ((OrCI & KnownZero) != OrCI) return SDValue(); @@ -12657,6 +12632,19 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } } } + case ARMISD::BFI: { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); + const APInt &Mask = CI->getAPIntValue(); + KnownZero &= Mask; + KnownOne &= Mask; + return; + } } } @@ -13052,7 +13040,9 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { // rem = a - b * div // return {div, rem} // This should be lowered into UDIV/SDIV + MLS later on. - if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() && + bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() + : Subtarget->hasDivideInARMMode(); + if (hasDivide && Op->getValueType(0).isSimple() && Op->getSimpleValueType(0) == MVT::i32) { unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; const SDValue Dividend = Op->getOperand(0); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 70a0b1380ec9..8b54ce430ed2 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -717,7 +717,7 @@ class InstrItineraryData; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index cc0e7d4d9c35..703e8071b177 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -259,8 +259,8 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16","half-float conversions">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16","full half-float">; -def HasDivide : Predicate<"Subtarget->hasDivide()">, - AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; +def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">, + AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">; def HasDSP : Predicate<"Subtarget->hasDSP()">, diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 681e235d78f0..9b08c612e16b 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -587,6 +587,14 @@ def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; +def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>, + SDTCisVT<2, v8i8>]>; +def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>, + SDTCisVT<2, v8i8>, SDTCisVT<3, v8i8>]>; +def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>; +def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>; + + def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = 
cast<ConstantSDNode>(N->getOperand(0)); unsigned EltBits = 0; @@ -6443,7 +6451,8 @@ def VTBL1 : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd), (ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1, "vtbl", "8", "$Vd, $Vn, $Vm", "", - [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>; + [(set DPR:$Vd, (v8i8 (NEONvtbl1 VecListOneD:$Vn, DPR:$Vm)))]>; + let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd), @@ -6498,6 +6507,49 @@ def VTBX4Pseudo IIC_VTBX4, "$orig = $dst", []>; } // DecoderMethod = "DecodeTBLInstruction" +def : Pat<(v8i8 (NEONvtbl2 v8i8:$Vn0, v8i8:$Vn1, v8i8:$Vm)), + (v8i8 (VTBL2 (REG_SEQUENCE DPair, v8i8:$Vn0, dsub_0, + v8i8:$Vn1, dsub_1), + v8i8:$Vm))>; +def : Pat<(v8i8 (int_arm_neon_vtbx2 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1, + v8i8:$Vm)), + (v8i8 (VTBX2 v8i8:$orig, + (REG_SEQUENCE DPair, v8i8:$Vn0, dsub_0, + v8i8:$Vn1, dsub_1), + v8i8:$Vm))>; + +def : Pat<(v8i8 (int_arm_neon_vtbl3 v8i8:$Vn0, v8i8:$Vn1, + v8i8:$Vn2, v8i8:$Vm)), + (v8i8 (VTBL3Pseudo (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0, + v8i8:$Vn1, dsub_1, + v8i8:$Vn2, dsub_2, + (v8i8 (IMPLICIT_DEF)), dsub_3), + v8i8:$Vm))>; +def : Pat<(v8i8 (int_arm_neon_vtbx3 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1, + v8i8:$Vn2, v8i8:$Vm)), + (v8i8 (VTBX3Pseudo v8i8:$orig, + (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0, + v8i8:$Vn1, dsub_1, + v8i8:$Vn2, dsub_2, + (v8i8 (IMPLICIT_DEF)), dsub_3), + v8i8:$Vm))>; + +def : Pat<(v8i8 (int_arm_neon_vtbl4 v8i8:$Vn0, v8i8:$Vn1, + v8i8:$Vn2, v8i8:$Vn3, v8i8:$Vm)), + (v8i8 (VTBL4Pseudo (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0, + v8i8:$Vn1, dsub_1, + v8i8:$Vn2, dsub_2, + v8i8:$Vn3, dsub_3), + v8i8:$Vm))>; +def : Pat<(v8i8 (int_arm_neon_vtbx4 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1, + v8i8:$Vn2, v8i8:$Vn3, v8i8:$Vm)), + (v8i8 (VTBX4Pseudo v8i8:$orig, + (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0, + v8i8:$Vn1, dsub_1, + v8i8:$Vn2, dsub_2, + v8i8:$Vn3, dsub_3), + v8i8:$Vm))>; + // VRINT : Vector Rounding multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index f5b673b78ad7..f710ee6a7e77 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2797,7 +2797,7 @@ def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">; def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV, "sdiv", "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>, - Requires<[HasDivide, IsThumb, HasV8MBaseline]>, + Requires<[HasDivideInThumb, IsThumb, HasV8MBaseline]>, Sched<[WriteDIV]> { let Inst{31-27} = 0b11111; let Inst{26-21} = 0b011100; @@ -2809,7 +2809,7 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV, def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV, "udiv", "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>, - Requires<[HasDivide, IsThumb, HasV8MBaseline]>, + Requires<[HasDivideInThumb, IsThumb, HasV8MBaseline]>, Sched<[WriteDIV]> { let Inst{31-27} = 0b11111; let Inst{26-21} = 0b011101; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 8d224d6a70fa..816596b85721 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -299,6 +299,20 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; + case 
G_SUB: + I.setDesc(TII.get(ARM::SUBrr)); + MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); + break; + case G_MUL: + if (TII.getSubtarget().hasV6Ops()) { + I.setDesc(TII.get(ARM::MUL)); + } else { + assert(TII.getSubtarget().useMulOps() && "Unsupported target"); + I.setDesc(TII.get(ARM::MULv5)); + MIB->getOperand(0).setIsEarlyClobber(true); + } + MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); + break; case G_FADD: if (!selectFAdd(MIB, TII, MRI)) return false; diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 994bbd673dd8..fe9681439e6b 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -43,8 +43,9 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, 1, p0}, Legal); } - for (auto Ty : {s1, s8, s16, s32}) - setAction({G_ADD, Ty}, Legal); + for (unsigned Op : {G_ADD, G_SUB, G_MUL}) + for (auto Ty : {s1, s8, s16, s32}) + setAction({Op, Ty}, Legal); for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 08f3da738868..e47bd3a8963e 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -219,6 +219,8 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { switch (Opc) { case G_ADD: + case G_SUB: + case G_MUL: case G_SEXT: case G_ZEXT: case G_GEP: diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 40993fc0aa8a..d2630685d91b 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -208,8 +208,8 @@ protected: /// FP registers for VFPv3. bool HasD16 = false; - /// HasHardwareDivide - True if subtarget supports [su]div - bool HasHardwareDivide = false; + /// HasHardwareDivide - True if subtarget supports [su]div in Thumb mode + bool HasHardwareDivideInThumb = false; /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode bool HasHardwareDivideInARM = false; @@ -507,7 +507,7 @@ public: return hasNEON() && UseNEONForSinglePrecisionFP; } - bool hasDivide() const { return HasHardwareDivide; } + bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; } bool hasDivideInARMMode() const { return HasHardwareDivideInARM; } bool hasDataBarrier() const { return HasDataBarrier; } bool hasV7Clrex() const { return HasV7Clrex; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f421d3ac1693..ada816c16389 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -67,6 +67,9 @@ static cl::opt<ImplicitItModeTy> ImplicitItMode( clEnumValN(ImplicitItModeTy::ThumbOnly, "thumb", "Warn in ARM, emit implicit ITs in Thumb"))); +static cl::opt<bool> AddBuildAttributes("arm-add-build-attributes", + cl::init(false)); + class ARMOperand; enum VectorLaneTy { NoLanes, AllLanes, IndexedLane }; @@ -540,6 +543,10 @@ public: // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + // Add build attributes based on the selected target. + if (AddBuildAttributes) + getTargetStreamer().emitTargetAttributes(STI); + // Not in an ITBlock to start with. 
ITState.CurPosition = ~0U; @@ -10189,8 +10196,8 @@ static const struct { { ARM::AEK_CRYPTO, Feature_HasV8, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} }, - { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, - {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} }, + { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, + {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} }, { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 6fa890ba1cd5..4d6c52f3cd49 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -464,7 +464,7 @@ public: void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes); void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { - LastMappingSymbols[getPreviousSection().first] = std::move(LastEMSInfo); + LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo); MCELFStreamer::ChangeSection(Section, Subsection); auto LastMappingSymbol = LastMappingSymbols.find(Section); if (LastMappingSymbol != LastMappingSymbols.end()) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 73e563890dd9..2b0cd461df7a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -11,9 +11,13 @@ // //===----------------------------------------------------------------------===// +#include "ARMTargetMachine.h" #include "llvm/MC/ConstantPools.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/TargetParser.h" using namespace llvm; @@ -75,3 +79,179 @@ void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {} void ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {} void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {} + +static ARMBuildAttrs::CPUArch getArchForCPU(const MCSubtargetInfo &STI) { + if (STI.getCPU() == "xscale") + return ARMBuildAttrs::v5TEJ; + + if (STI.hasFeature(ARM::HasV8Ops)) { + if (STI.hasFeature(ARM::FeatureRClass)) + return ARMBuildAttrs::v8_R; + return ARMBuildAttrs::v8_A; + } else if (STI.hasFeature(ARM::HasV8MMainlineOps)) + return ARMBuildAttrs::v8_M_Main; + else if (STI.hasFeature(ARM::HasV7Ops)) { + if (STI.hasFeature(ARM::FeatureMClass) && STI.hasFeature(ARM::FeatureDSP)) + return ARMBuildAttrs::v7E_M; + return ARMBuildAttrs::v7; + } else if (STI.hasFeature(ARM::HasV6T2Ops)) + return ARMBuildAttrs::v6T2; + else if (STI.hasFeature(ARM::HasV8MBaselineOps)) + return ARMBuildAttrs::v8_M_Base; + else if (STI.hasFeature(ARM::HasV6MOps)) + return ARMBuildAttrs::v6S_M; + else if (STI.hasFeature(ARM::HasV6Ops)) + return ARMBuildAttrs::v6; + else if (STI.hasFeature(ARM::HasV5TEOps)) + return ARMBuildAttrs::v5TE; + else if (STI.hasFeature(ARM::HasV5TOps)) + return ARMBuildAttrs::v5T; + else if (STI.hasFeature(ARM::HasV4TOps)) + return ARMBuildAttrs::v4T; + else + return ARMBuildAttrs::v4; +} + +static bool isV8M(const MCSubtargetInfo &STI) { + // Note that v8M Baseline is a subset of v6T2! 
+ return (STI.hasFeature(ARM::HasV8MBaselineOps) && + !STI.hasFeature(ARM::HasV6T2Ops)) || + STI.hasFeature(ARM::HasV8MMainlineOps); +} + +/// Emit the build attributes that only depend on the hardware that we expect +/// to be available, and not on the ABI or any source-language choices. +void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { + switchVendor("aeabi"); + + const StringRef CPUString = STI.getCPU(); + if (!CPUString.empty() && !CPUString.startswith("generic")) { + // FIXME: remove krait check when GNU tools support krait cpu + if (STI.hasFeature(ARM::ProcKrait)) { + emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9"); + // We consider krait as a "cortex-a9" + hwdiv CPU + // Enable hwdiv through ".arch_extension idiv" + if (STI.hasFeature(ARM::FeatureHWDivThumb) || + STI.hasFeature(ARM::FeatureHWDivARM)) + emitArchExtension(ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM); + } else { + emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); + } + } + + emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(STI)); + + if (STI.hasFeature(ARM::FeatureAClass)) { + emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::ApplicationProfile); + } else if (STI.hasFeature(ARM::FeatureRClass)) { + emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::RealTimeProfile); + } else if (STI.hasFeature(ARM::FeatureMClass)) { + emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::MicroControllerProfile); + } + + emitAttribute(ARMBuildAttrs::ARM_ISA_use, STI.hasFeature(ARM::FeatureNoARM) + ? ARMBuildAttrs::Not_Allowed + : ARMBuildAttrs::Allowed); + + if (isV8M(STI)) { + emitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumbDerived); + } else if (STI.hasFeature(ARM::FeatureThumb2)) { + emitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumb32); + } else if (STI.hasFeature(ARM::HasV4TOps)) { + emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed); + } + + if (STI.hasFeature(ARM::FeatureNEON)) { + /* NEON is not exactly a VFP architecture, but GAS emits one of + * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ + if (STI.hasFeature(ARM::FeatureFPARMv8)) { + if (STI.hasFeature(ARM::FeatureCrypto)) + emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8); + else + emitFPU(ARM::FK_NEON_FP_ARMV8); + } else if (STI.hasFeature(ARM::FeatureVFP4)) + emitFPU(ARM::FK_NEON_VFPV4); + else + emitFPU(STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_NEON_FP16 + : ARM::FK_NEON); + // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture + if (STI.hasFeature(ARM::HasV8Ops)) + emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + STI.hasFeature(ARM::HasV8_1aOps) + ? ARMBuildAttrs::AllowNeonARMv8_1a + : ARMBuildAttrs::AllowNeonARMv8); + } else { + if (STI.hasFeature(ARM::FeatureFPARMv8)) + // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one + // FPU, but there are two different names for it depending on the CPU. + emitFPU(STI.hasFeature(ARM::FeatureD16) + ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV5_SP_D16 + : ARM::FK_FPV5_D16) + : ARM::FK_FP_ARMV8); + else if (STI.hasFeature(ARM::FeatureVFP4)) + emitFPU(STI.hasFeature(ARM::FeatureD16) + ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV4_SP_D16 + : ARM::FK_VFPV4_D16) + : ARM::FK_VFPV4); + else if (STI.hasFeature(ARM::FeatureVFP3)) + emitFPU( + STI.hasFeature(ARM::FeatureD16) + // +d16 + ? (STI.hasFeature(ARM::FeatureVFPOnlySP) + ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16 + : ARM::FK_VFPV3XD) + : (STI.hasFeature(ARM::FeatureFP16) + ?
ARM::FK_VFPV3_D16_FP16 + : ARM::FK_VFPV3_D16)) + // -d16 + : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_FP16 + : ARM::FK_VFPV3)); + else if (STI.hasFeature(ARM::FeatureVFP2)) + emitFPU(ARM::FK_VFPV2); + } + + // ABI_HardFP_use attribute to indicate single precision FP. + if (STI.hasFeature(ARM::FeatureVFPOnlySP)) + emitAttribute(ARMBuildAttrs::ABI_HardFP_use, + ARMBuildAttrs::HardFPSinglePrecision); + + if (STI.hasFeature(ARM::FeatureFP16)) + emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP); + + if (STI.hasFeature(ARM::FeatureMP)) + emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP); + + // Hardware divide in ARM mode is part of base arch, starting from ARMv8. + // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M). + // It is not possible to produce DisallowDIV: if hwdiv is present in the base + // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits. + // AllowDIVExt is only emitted if hwdiv isn't available in the base arch; + // otherwise, the default value (AllowDIVIfExists) applies. + if (STI.hasFeature(ARM::FeatureHWDivARM) && !STI.hasFeature(ARM::HasV8Ops)) + emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt); + + if (STI.hasFeature(ARM::FeatureDSP) && isV8M(STI)) + emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed); + + if (STI.hasFeature(ARM::FeatureStrictAlign)) + emitAttribute(ARMBuildAttrs::CPU_unaligned_access, + ARMBuildAttrs::Not_Allowed); + else + emitAttribute(ARMBuildAttrs::CPU_unaligned_access, + ARMBuildAttrs::Allowed); + + if (STI.hasFeature(ARM::FeatureTrustZone) && + STI.hasFeature(ARM::FeatureVirtualization)) + emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowTZVirtualization); + else if (STI.hasFeature(ARM::FeatureTrustZone)) + emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowTZ); + else if (STI.hasFeature(ARM::FeatureVirtualization)) + emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowVirtualization); +} diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index fc083b98395b..d0fd366ab9ed 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -83,13 +83,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // ADJCALLSTACKUP -> add, sp, sp, amount MachineInstr &Old = *I; DebugLoc dl = Old.getDebugLoc(); - unsigned Amount = Old.getOperand(0).getImm(); + unsigned Amount = TII.getFrameSize(Old); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned Align = getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; + Amount = alignTo(Amount, getStackAlignment()); // Replace the pseudo instruction with a new instruction... 
unsigned Opc = Old.getOpcode(); diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp index 9f2ee8cf8035..535bb012eb07 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp @@ -18,7 +18,7 @@ namespace llvm { AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) { - PointerSize = 2; + CodePointerSize = 2; CalleeSaveStackSlotSize = 2; CommentString = ";"; PrivateGlobalPrefix = ".L"; diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index 559ac291a79e..fd7c97bf1f0a 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -42,7 +42,7 @@ public: // messed up in random places by 4 bytes. .debug_line // section will be parsable, but with odd offsets and // line numbers, etc. - PointerSize = 8; + CodePointerSize = 8; } }; } diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index 61d3630ac095..cb3049bf1500 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -1011,12 +1011,7 @@ void BT::subst(RegisterRef OldRR, RegisterRef NewRR) { bool BT::reached(const MachineBasicBlock *B) const { int BN = B->getNumber(); assert(BN >= 0); - for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end(); - I != E; ++I) { - if (I->second == BN) - return true; - } - return false; + return ReachedBB.count(BN); } // Visit an individual instruction. This could be a newly added instruction, @@ -1036,6 +1031,8 @@ void BT::reset() { EdgeExec.clear(); InstrExec.clear(); Map.clear(); + ReachedBB.clear(); + ReachedBB.reserve(MF.size()); } void BT::run() { @@ -1068,6 +1065,7 @@ void BT::run() { if (EdgeExec.count(Edge)) continue; EdgeExec.insert(Edge); + ReachedBB.insert(Edge.second); const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second); MachineBasicBlock::const_iterator It = B.begin(), End = B.end(); diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h index a547b34e852f..7f49f430382d 100644 --- a/lib/Target/Hexagon/BitTracker.h +++ b/lib/Target/Hexagon/BitTracker.h @@ -10,6 +10,7 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H #define LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" @@ -68,10 +69,11 @@ private: typedef std::set<const MachineInstr *> InstrSetType; typedef std::queue<CFGEdge> EdgeQueueType; - EdgeSetType EdgeExec; // Executable flow graph edges. - InstrSetType InstrExec; // Executable instructions. - EdgeQueueType FlowQ; // Work queue of CFG edges. - bool Trace; // Enable tracing for debugging. + EdgeSetType EdgeExec; // Executable flow graph edges. + InstrSetType InstrExec; // Executable instructions. + EdgeQueueType FlowQ; // Work queue of CFG edges. + DenseSet<unsigned> ReachedBB; // Cache of reached blocks. + bool Trace; // Enable tracing for debugging. 
const MachineEvaluator &ME; MachineFunction &MF; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 418dd71aeb4b..e5eb059b566f 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -635,7 +635,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps); } -bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // If either no tail call or told not to tail call at all, don't. auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index fb8f0ba6b057..1415156487c0 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -195,7 +195,7 @@ namespace HexagonISD { const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index b243de317dc5..27b40f134b1f 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -35,7 +35,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> -#include <map> static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " @@ -45,10 +44,8 @@ using namespace llvm; using namespace rdf; namespace llvm { - FunctionPass *createHexagonOptAddrMode(); - void initializeHexagonOptAddrModePass(PassRegistry &); - + void initializeHexagonOptAddrModePass(PassRegistry&); } // end namespace llvm namespace { @@ -59,10 +56,7 @@ public: HexagonOptAddrMode() : MachineFunctionPass(ID), HII(nullptr), MDT(nullptr), DFG(nullptr), - LV(nullptr) { - PassRegistry &R = *PassRegistry::getPassRegistry(); - initializeHexagonOptAddrModePass(R); - } + LV(nullptr) {} StringRef getPassName() const override { return "Optimize addressing mode of load/store"; @@ -84,7 +78,6 @@ private: MachineDominatorTree *MDT; DataFlowGraph *DFG; DataFlowGraph::DefStackMap DefM; - std::map<RegisterRef, std::map<NodeId, NodeId>> RDefMap; Liveness *LV; MISetType Deleted; @@ -99,8 +92,6 @@ private: void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList); bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList); short getBaseWithLongOffset(const MachineInstr &MI) const; - void updateMap(NodeAddr<InstrNode *> IA); - bool constructDefMap(MachineBasicBlock *B); bool changeStore(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum); bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum); @@ -112,11 +103,11 @@ private: char HexagonOptAddrMode::ID = 0; -INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "opt-amode", +INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) -INITIALIZE_PASS_END(HexagonOptAddrMode, "opt-amode", "Optimize addressing mode", 
+INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode", false, false) bool HexagonOptAddrMode::hasRepForm(MachineInstr &MI, unsigned TfrDefR) { @@ -173,8 +164,11 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) { NodeAddr<UseNode *> UA = *I; NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG); - if ((UA.Addr->getFlags() & NodeAttrs::PhiRef) || - RDefMap[OffsetRR][IA.Id] != OffsetRegRD) + if (UA.Addr->getFlags() & NodeAttrs::PhiRef) + return false; + NodeAddr<RefNode*> AA = LV->getNearestAliasedRef(OffsetRR, IA); + if ((DFG->IsDef(AA) && AA.Id != OffsetRegRD) || + AA.Addr->getReachingDef() != OffsetRegRD) return false; MachineInstr &UseMI = *NodeAddr<StmtNode *>(IA).Addr->getCode(); @@ -486,14 +480,14 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN, MIB.add(AddAslMI->getOperand(2)); MIB.add(AddAslMI->getOperand(3)); const GlobalValue *GV = ImmOp.getGlobal(); - MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm(), + MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm()+ImmOp.getOffset(), ImmOp.getTargetFlags()); OpStart = 3; } else if (UseMID.mayStore()) { MIB.add(AddAslMI->getOperand(2)); MIB.add(AddAslMI->getOperand(3)); const GlobalValue *GV = ImmOp.getGlobal(); - MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm(), + MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm()+ImmOp.getOffset(), ImmOp.getTargetFlags()); MIB.add(UseMI->getOperand(2)); OpStart = 3; @@ -597,46 +591,6 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) { return Changed; } -void HexagonOptAddrMode::updateMap(NodeAddr<InstrNode *> IA) { - RegisterSet RRs; - for (NodeAddr<RefNode *> RA : IA.Addr->members(*DFG)) - RRs.insert(RA.Addr->getRegRef(*DFG)); - bool Common = false; - for (auto &R : RDefMap) { - if (!RRs.count(R.first)) - continue; - Common = true; - break; - } - if (!Common) - return; - - for (auto &R : RDefMap) { - auto F = DefM.find(R.first.Reg); - if (F == DefM.end() || F->second.empty()) - continue; - R.second[IA.Id] = F->second.top()->Id; - } -} - -bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) { - bool Changed = false; - auto BA = DFG->getFunc().Addr->findBlock(B, *DFG); - DFG->markBlock(BA.Id, DefM); - - for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) { - updateMap(IA); - DFG->pushAllDefs(IA, DefM); - } - - MachineDomTreeNode *N = MDT->getNode(B); - for (auto I : *N) - Changed |= constructDefMap(I->getBlock()); - - DFG->releaseBlock(BA.Id, DefM); - return Changed; -} - bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; @@ -658,8 +612,6 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { L.computePhiInfo(); LV = &L; - constructDefMap(&DFG->getMF().front()); - Deleted.clear(); NodeAddr<FuncNode *> FA = DFG->getFunc(); DEBUG(dbgs() << "==== [RefMap#]=====:\n " diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 06fc9195fa67..6913d50bbcaa 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -111,6 +111,7 @@ namespace llvm { extern char &HexagonExpandCondsetsID; void initializeHexagonExpandCondsetsPass(PassRegistry&); void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); + void initializeHexagonOptAddrModePass(PassRegistry&); Pass *createHexagonLoopIdiomPass(); FunctionPass *createHexagonBitSimplify(); @@ -152,6 
+153,7 @@ extern "C" void LLVMInitializeHexagonTarget() { // Register the target. RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget()); initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + initializeHexagonOptAddrModePass(*PassRegistry::getPassRegistry()); } HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp index 57ce9fabc5e3..ea86ffba58f6 100644 --- a/lib/Target/Hexagon/RDFCopy.cpp +++ b/lib/Target/Hexagon/RDFCopy.cpp @@ -59,7 +59,7 @@ void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) { bool CopyPropagation::scanBlock(MachineBasicBlock *B) { bool Changed = false; - auto BA = DFG.getFunc().Addr->findBlock(B, DFG); + NodeAddr<BlockNode*> BA = DFG.findBlock(B); for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { if (DFG.IsCode<NodeAttrs::Stmt>(IA)) { diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h index d5faca4cd6f4..52f390356b26 100644 --- a/lib/Target/Hexagon/RDFGraph.h +++ b/lib/Target/Hexagon/RDFGraph.h @@ -508,7 +508,8 @@ namespace rdf { static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize, "NodeBase must be at most NodeAllocator::NodeMemSize bytes"); - typedef std::vector<NodeAddr<NodeBase*>> NodeList; +// typedef std::vector<NodeAddr<NodeBase*>> NodeList; + typedef SmallVector<NodeAddr<NodeBase*>,4> NodeList; typedef std::set<NodeId> NodeSet; struct RefNode : public NodeBase { diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp index 5c5496a548af..4224ded3418b 100644 --- a/lib/Target/Hexagon/RDFRegisters.cpp +++ b/lib/Target/Hexagon/RDFRegisters.cpp @@ -69,6 +69,19 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, for (const MachineOperand &Op : In.operands()) if (Op.isRegMask()) RegMasks.insert(Op.getRegMask()); + + MaskInfos.resize(RegMasks.size()+1); + for (uint32_t M = 1, NM = RegMasks.size(); M <= NM; ++M) { + BitVector PU(TRI.getNumRegUnits()); + const uint32_t *MB = RegMasks.get(M); + for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) { + if (!(MB[i/32] & (1u << (i%32)))) + continue; + for (MCRegUnitIterator U(i, &TRI); U.isValid(); ++U) + PU.set(*U); + } + MaskInfos[M].Units = PU.flip(); + } } RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const { @@ -201,17 +214,8 @@ bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const { bool RegisterAggr::hasAliasOf(RegisterRef RR) const { - if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) { - // XXX SLOW - const uint32_t *MB = PRI.getRegMaskBits(RR.Reg); - for (unsigned i = 1, e = PRI.getTRI().getNumRegs(); i != e; ++i) { - if (MB[i/32] & (1u << (i%32))) - continue; - if (hasAliasOf(RegisterRef(i, LaneBitmask::getAll()))) - return true; - } - return false; - } + if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) + return Units.anyCommon(PRI.getMaskUnits(RR.Reg)); for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { std::pair<uint32_t,LaneBitmask> P = *U; @@ -224,15 +228,8 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const { bool RegisterAggr::hasCoverOf(RegisterRef RR) const { if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) { - // XXX SLOW - const uint32_t *MB = PRI.getRegMaskBits(RR.Reg); - for (unsigned i = 1, e = PRI.getTRI().getNumRegs(); i != e; ++i) { - if (MB[i/32] & (1u << (i%32))) - continue; - if (!hasCoverOf(RegisterRef(i, LaneBitmask::getAll()))) - return false; - } - return true; + BitVector 
T(PRI.getMaskUnits(RR.Reg)); + return T.reset(Units).none(); } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { @@ -246,15 +243,7 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const { RegisterAggr &RegisterAggr::insert(RegisterRef RR) { if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) { - BitVector PU(PRI.getTRI().getNumRegUnits()); // Preserved units. - const uint32_t *MB = PRI.getRegMaskBits(RR.Reg); - for (unsigned i = 1, e = PRI.getTRI().getNumRegs(); i != e; ++i) { - if (!(MB[i/32] & (1u << (i%32)))) - continue; - for (MCRegUnitIterator U(i, &PRI.getTRI()); U.isValid(); ++U) - PU.set(*U); - } - Units |= PU.flip(); + Units |= PRI.getMaskUnits(RR.Reg); return *this; } diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h index 4b35c85a6b62..314d8b5666d7 100644 --- a/lib/Target/Hexagon/RDFRegisters.h +++ b/lib/Target/Hexagon/RDFRegisters.h @@ -51,6 +51,8 @@ namespace rdf { return F - Map.begin() + 1; } + uint32_t size() const { return Map.size(); } + typedef typename std::vector<T>::const_iterator const_iterator; const_iterator begin() const { return Map.begin(); } const_iterator end() const { return Map.end(); } @@ -107,6 +109,9 @@ namespace rdf { RegisterRef getRefForUnit(uint32_t U) const { return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask); } + const BitVector &getMaskUnits(RegisterId MaskId) const { + return MaskInfos[TargetRegisterInfo::stackSlot2Index(MaskId)].Units; + } const TargetRegisterInfo &getTRI() const { return TRI; } @@ -118,11 +123,15 @@ namespace rdf { RegisterId Reg = 0; LaneBitmask Mask; }; + struct MaskInfo { + BitVector Units; + }; const TargetRegisterInfo &TRI; + IndexedSet<const uint32_t*> RegMasks; std::vector<RegInfo> RegInfos; std::vector<UnitInfo> UnitInfos; - IndexedSet<const uint32_t*> RegMasks; + std::vector<MaskInfo> MaskInfos; bool aliasRR(RegisterRef RA, RegisterRef RB) const; bool aliasRM(RegisterRef RR, RegisterRef RM) const; @@ -135,7 +144,7 @@ namespace rdf { : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {} RegisterAggr(const RegisterAggr &RG) = default; - bool empty() const { return Units.empty(); } + bool empty() const { return Units.none(); } bool hasAliasOf(RegisterRef RR) const; bool hasCoverOf(RegisterRef RR) const; static bool isCoverOf(RegisterRef RA, RegisterRef RB, diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp index c26b3081dbc3..82e6731ecd78 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; void MSP430MCAsmInfo::anchor() { } MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) { - PointerSize = CalleeSaveStackSlotSize = 2; + CodePointerSize = CalleeSaveStackSlotSize = 2; CommentString = ";"; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index ebe3c5784888..11411d997bb3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -23,7 +23,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) { if ((TheTriple.getArch() == Triple::mips64el) || (TheTriple.getArch() == Triple::mips64)) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; } // FIXME: This condition isn't quite right but it's the best we can do until diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 8b04fcb76920..bf79f0f2ff82 100644 
--- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -3781,6 +3781,80 @@ let Predicates = [HasMSA] in { ISA_MIPS1_NOT_32R6_64R6; } +def vsplati64_imm_eq_63 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{ + APInt Imm; + SDNode *BV = N->getOperand(0).getNode(); + EVT EltTy = N->getValueType(0).getVectorElementType(); + + return selectVSplat(BV, Imm, EltTy.getSizeInBits()) && + Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 63; +}]>; + +def immi32Cst7 : ImmLeaf<i32, [{return isUInt<32>(Imm) && Imm == 7;}]>; +def immi32Cst15 : ImmLeaf<i32, [{return isUInt<32>(Imm) && Imm == 15;}]>; +def immi32Cst31 : ImmLeaf<i32, [{return isUInt<32>(Imm) && Imm == 31;}]>; + +def vsplati8imm7 : PatFrag<(ops node:$wt), + (and node:$wt, (vsplati8 immi32Cst7))>; +def vsplati16imm15 : PatFrag<(ops node:$wt), + (and node:$wt, (vsplati16 immi32Cst15))>; +def vsplati32imm31 : PatFrag<(ops node:$wt), + (and node:$wt, (vsplati32 immi32Cst31))>; +def vsplati64imm63 : PatFrag<(ops node:$wt), + (and node:$wt, vsplati64_imm_eq_63)>; + +class MSAShiftPat<SDNode Node, ValueType VT, MSAInst Insn, dag Vec> : + MSAPat<(VT (Node VT:$ws, (VT (and VT:$wt, Vec)))), + (VT (Insn VT:$ws, VT:$wt))>; + +class MSABitPat<SDNode Node, ValueType VT, MSAInst Insn, PatFrag Frag> : + MSAPat<(VT (Node VT:$ws, (shl vsplat_imm_eq_1, (Frag VT:$wt)))), + (VT (Insn VT:$ws, VT:$wt))>; + +multiclass MSAShiftPats<SDNode Node, string Insn> { + def : MSAShiftPat<Node, v16i8, !cast<MSAInst>(Insn#_B), + (vsplati8 immi32Cst7)>; + def : MSAShiftPat<Node, v8i16, !cast<MSAInst>(Insn#_H), + (vsplati16 immi32Cst15)>; + def : MSAShiftPat<Node, v4i32, !cast<MSAInst>(Insn#_W), + (vsplati32 immi32Cst31)>; + def : MSAPat<(v2i64 (Node v2i64:$ws, (v2i64 (and v2i64:$wt, + vsplati64_imm_eq_63)))), + (v2i64 (!cast<MSAInst>(Insn#_D) v2i64:$ws, v2i64:$wt))>; +} + +multiclass MSABitPats<SDNode Node, string Insn> { + def : MSABitPat<Node, v16i8, !cast<MSAInst>(Insn#_B), vsplati8imm7>; + def : MSABitPat<Node, v8i16, !cast<MSAInst>(Insn#_H), vsplati16imm15>; + def : MSABitPat<Node, v4i32, !cast<MSAInst>(Insn#_W), vsplati32imm31>; + def : MSAPat<(Node v2i64:$ws, (shl (v2i64 vsplati64_imm_eq_1), + (vsplati64imm63 v2i64:$wt))), + (v2i64 (!cast<MSAInst>(Insn#_D) v2i64:$ws, v2i64:$wt))>; +} + +defm : MSAShiftPats<shl, "SLL">; +defm : MSAShiftPats<srl, "SRL">; +defm : MSAShiftPats<sra, "SRA">; +defm : MSABitPats<xor, "BNEG">; +defm : MSABitPats<or, "BSET">; + +def : MSAPat<(and v16i8:$ws, (xor (shl vsplat_imm_eq_1, + (vsplati8imm7 v16i8:$wt)), + immAllOnesV)), + (v16i8 (BCLR_B v16i8:$ws, v16i8:$wt))>; +def : MSAPat<(and v8i16:$ws, (xor (shl vsplat_imm_eq_1, + (vsplati16imm15 v8i16:$wt)), + immAllOnesV)), + (v8i16 (BCLR_H v8i16:$ws, v8i16:$wt))>; +def : MSAPat<(and v4i32:$ws, (xor (shl vsplat_imm_eq_1, + (vsplati32imm31 v4i32:$wt)), + immAllOnesV)), + (v4i32 (BCLR_W v4i32:$ws, v4i32:$wt))>; +def : MSAPat<(and v2i64:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1), + (vsplati64imm63 v2i64:$wt)), + (bitconvert (v4i32 immAllOnesV)))), + (v2i64 (BCLR_D v2i64:$ws, v2i64:$wt))>; + // Vector extraction with fixed index. 
// // Extracting 32-bit values on MSA32 should always use COPY_S_W rather than diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index e2da8477295b..bf7f079e3105 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1547,11 +1547,24 @@ static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG, return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm); } +static SDValue truncateVecElts(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT ResTy = Op->getValueType(0); + SDValue Vec = Op->getOperand(2); + bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian(); + MVT ResEltTy = ResTy == MVT::v2i64 ? MVT::i64 : MVT::i32; + SDValue ConstValue = DAG.getConstant(Vec.getScalarValueSizeInBits() - 1, + DL, ResEltTy); + SDValue SplatVec = getBuildVectorSplat(ResTy, ConstValue, BigEndian, DAG); + + return DAG.getNode(ISD::AND, DL, ResTy, Vec, SplatVec); +} + static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) { EVT ResTy = Op->getValueType(0); SDLoc DL(Op); SDValue One = DAG.getConstant(1, DL, ResTy); - SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2)); + SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, truncateVecElts(Op, DAG)); return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), DAG.getNOT(DL, Bit, ResTy)); @@ -1687,7 +1700,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1), DAG.getNode(ISD::SHL, DL, VecTy, One, - Op->getOperand(2))); + truncateVecElts(Op, DAG))); } case Intrinsic::mips_bnegi_b: case Intrinsic::mips_bnegi_h: @@ -1723,7 +1736,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1), DAG.getNode(ISD::SHL, DL, VecTy, One, - Op->getOperand(2))); + truncateVecElts(Op, DAG))); } case Intrinsic::mips_bseti_b: case Intrinsic::mips_bseti_h: @@ -2210,7 +2223,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_sll_w: case Intrinsic::mips_sll_d: return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1), - Op->getOperand(2)); + truncateVecElts(Op, DAG)); case Intrinsic::mips_slli_b: case Intrinsic::mips_slli_h: case Intrinsic::mips_slli_w: @@ -2240,7 +2253,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_sra_w: case Intrinsic::mips_sra_d: return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1), - Op->getOperand(2)); + truncateVecElts(Op, DAG)); case Intrinsic::mips_srai_b: case Intrinsic::mips_srai_h: case Intrinsic::mips_srai_w: @@ -2270,7 +2283,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srl_w: case Intrinsic::mips_srl_d: return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1), - Op->getOperand(2)); + truncateVecElts(Op, DAG)); case Intrinsic::mips_srli_b: case Intrinsic::mips_srli_h: case Intrinsic::mips_srli_w: diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 78bdf4e698d8..bdd0f156c8af 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -27,7 +27,7 @@ void NVPTXMCAsmInfo::anchor() {} NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) { if (TheTriple.getArch() == Triple::nvptx64) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize 
= 8; } CommentString = "//"; diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 21e25de80dc7..ba28cd83278b 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -2004,7 +2004,7 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, for (unsigned I = 0, E = DL.getTypeAllocSize(CPV->getType()); I < E; ++I) { uint8_t Byte = Val.getLoBits(8).getZExtValue(); aggBuffer->addBytes(&Byte, 1, 1); - Val = Val.lshr(8); + Val.lshrInPlace(8); } return; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index d8fab5b7c01a..d30bf1a56e8a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -20,7 +20,7 @@ void PPCMCAsmInfoDarwin::anchor() { } PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { if (is64Bit) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; } IsLittleEndian = false; @@ -50,7 +50,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { NeedsLocalForSize = true; if (is64Bit) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; } IsLittleEndian = T.getArch() == Triple::ppc64le; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9c72638023bb..125c00295f88 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -2977,10 +2977,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { SDValue Chain = LD->getChain(); SDValue Ops[] = { Base, Offset, Chain }; - SDNode *NewN = CurDAG->SelectNodeTo(N, PPC::LXVDSX, - N->getValueType(0), Ops); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = LD->getMemOperand(); + SDNode *NewN = CurDAG->SelectNodeTo(N, PPC::LXVDSX, + N->getValueType(0), Ops); cast<MachineSDNode>(NewN)->setMemRefs(MemOp, MemOp + 1); return; } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp index b164df8b595a..d622911e92c4 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp @@ -18,7 +18,7 @@ using namespace llvm; void RISCVMCAsmInfo::anchor() {} RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) { - PointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 8 : 4; + CodePointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 
8 : 4; CommentString = "#"; AlignmentIsInBytes = false; SupportsDebugInformation = true; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp index 3ed09898fb78..21df60237d96 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp @@ -28,7 +28,7 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Triple &TheTriple) { IsLittleEndian = (TheTriple.getArch() == Triple::sparcel); if (isV9) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; } Data16bitsDirective = "\t.half\t"; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp index b17977d41be1..6e00981939b6 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp @@ -14,7 +14,7 @@ using namespace llvm; SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) { - PointerSize = 8; + CodePointerSize = 8; CalleeSaveStackSlotSize = 8; IsLittleEndian = false; diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 84d3c7bed50a..f2fd581f7847 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -829,7 +829,7 @@ bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType, return isTruncateFree(FromType, ToType); } -bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 7d92a7355877..1c34dc43e8bb 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -454,7 +454,7 @@ public: MachineBasicBlock *BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; bool allowTruncateForTailCall(Type *, Type *) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index 2dcec5263fa1..5f8c78ed1683 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -22,7 +22,7 @@ using namespace llvm; WebAssemblyMCAsmInfoELF::~WebAssemblyMCAsmInfoELF() {} WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) { - PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; + CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; // TODO: What should MaxInstLength be? @@ -55,7 +55,7 @@ WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) { WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {} WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { - PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; + CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; // TODO: What should MaxInstLength be? 
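The WebAssemblyMCCodeEmitter hunk below switches block-signature immediates from ULEB128 to SLEB128. Signatures are encoded as small negative codes (for example -1 denotes the i32 value type), which is exactly where the two encodings diverge. A simplified sketch of both routines, assuming nothing beyond the standard library (llvm::encodeULEB128 and llvm::encodeSLEB128 in llvm/Support/LEB128.h are the real implementations):

#include <cstdint>
#include <vector>

// Unsigned LEB128: emit 7 bits per byte, high bit set while more remain.
void uleb128(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V != 0)
      Byte |= 0x80; // continuation bit
    Out.push_back(Byte);
  } while (V != 0);
}

// Signed LEB128: stop once the rest of the value is pure sign extension.
void sleb128(int64_t V, std::vector<uint8_t> &Out) {
  bool More;
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7; // arithmetic shift for negatives on mainstream compilers
    More = !((V == 0 && (Byte & 0x40) == 0) ||
             (V == -1 && (Byte & 0x40) != 0));
    if (More)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (More);
}

int main() {
  std::vector<uint8_t> S, U;
  sleb128(-1, S);                    // one byte: 0x7f
  uleb128(uint64_t(int64_t(-1)), U); // ten bytes: the unsigned reading blows up
  return (S.size() == 1 && U.size() == 10) ? 0 : 1;
}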
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index a0b008947491..544cd653fd72 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -94,6 +94,8 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc())); ++MCNumFixups; encodeULEB128(uint64_t(MO.getImm()), OS); + } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { + encodeSLEB128(int64_t(MO.getImm()), OS); } else { encodeULEB128(uint64_t(MO.getImm()), OS); } diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index f4c9a4ef6b9c..559165e4c86b 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -54,7 +54,7 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() { void OptimizeReturned::visitCallSite(CallSite CS) { for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i) - if (CS.paramHasAttr(0, Attribute::Returned)) { + if (CS.paramHasAttr(i, Attribute::Returned)) { Instruction *Inst = CS.getInstruction(); Value *Arg = CS.getArgOperand(i); // Ignore constants, globals, undef, etc. diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt index 8dd5e8a03e2e..8e8e5fd1eff1 100644 --- a/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -1,5 +1,15 @@ # Tests which are known to fail from the GCC torture test suite. +# Syntax: Each line has a single test to be marked as a 'known failure' (or +# 'exclusion'). Known failures are expected to fail, and will cause an error if +# they pass. (Known failures that do not run at all will not cause an +# error). The format is +# <name> <attributes> # comment +# +# The attributes in this case represent the different arguments passed to the +# compiler: 'wasm-s' is for compiling to .s files, and 'wasm-o' for compiling +# to wasm object files (.o). + # Computed gotos are not supported (Cannot select BlockAddress/BRIND) 20040302-1.c 20071210-1.c @@ -66,3 +76,21 @@ pr41935.c 920728-1.c pr28865.c widechar-2.c + +# crash: Running pass 'WebAssembly Explicit Locals' on function +20020107-1.c wasm-o +20030222-1.c wasm-o +20071220-1.c wasm-o +20071220-2.c wasm-o +990130-1.c wasm-o +pr38533.c wasm-o +pr41239.c wasm-o +pr43385.c wasm-o +pr43560.c wasm-o +pr45695.c wasm-o +pr49279.c wasm-o +pr49390.c wasm-o +pr52286.c wasm-o + +# fatal error: error in backend: data symbols must have a size set with .size +921110-1.c wasm-o diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 48a1d8f1330c..9c35a251e480 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -43,7 +43,7 @@ void X86MCAsmInfoDarwin::anchor() { } X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { bool is64Bit = T.getArch() == Triple::x86_64; if (is64Bit) - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; AssemblerDialect = AsmWriterFlavor; @@ -92,7 +92,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // For ELF, x86-64 pointer size depends on the ABI. // For x86-64 without the x32 ABI, pointer size is 8.
For x86 and for x86-64 // with the x32 ABI, pointer size remains the default 4. - PointerSize = (is64Bit && !isX32) ? 8 : 4; + CodePointerSize = (is64Bit && !isX32) ? 8 : 4; // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI. CalleeSaveStackSlotSize = is64Bit ? 8 : 4; @@ -129,7 +129,7 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; - PointerSize = 8; + CodePointerSize = 8; WinEHEncodingType = WinEH::EncodingType::Itanium; } else { // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just @@ -156,7 +156,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; - PointerSize = 8; + CodePointerSize = 8; WinEHEncodingType = WinEH::EncodingType::Itanium; ExceptionsType = ExceptionHandling::WinEH; } else { diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 78e0bca4158e..8678a13b95d0 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1698,21 +1698,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -// NOTE: this only has a subset of the full frame index logic. In -// particular, the FI < 0 and AfterFPPop logic is handled in -// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly -// (probably?) it should be moved into here. int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsFixed = MFI.isFixedObjectIndex(FI); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. if (TRI->hasBasePointer(MF)) - FrameReg = TRI->getBaseRegister(); + FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister(); else if (TRI->needsStackRealignment(MF)) - FrameReg = TRI->getStackRegister(); + FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister(); else FrameReg = TRI->getFrameRegister(MF); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index eb5c56ff2ff9..2d788bf0cf99 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1311,8 +1311,9 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. - if ((AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() && + // FIXME: Don't rely on DELETED_NODEs. 
+ if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && + AM.Base_Reg->getOpcode() != ISD::DELETED_NODE && !AM.Base_Reg.getNode()->hasOneUse()) || AM.BaseType == X86ISelAddressMode::FrameIndexBase) --Cost; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7ff483063ec2..b5f29fb400ef 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2742,13 +2742,13 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } -bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; - CallSite CS(CI); + ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; @@ -8327,13 +8327,13 @@ static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { APInt Val = Cst->getAPIntValue(); - Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); - Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); @@ -16069,7 +16069,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { unsigned EltBits = EltVT.getSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = - IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = EltVT == MVT::f64 ? APFloat::IEEEdouble() : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); @@ -16132,9 +16132,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( - APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT); + APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( - APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT); + APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) @@ -17344,10 +17344,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // bits of the inputs before performing those operations. 
if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); - SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, + SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); - Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); - Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); @@ -22111,11 +22111,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } // i64 vector arithmetic shift can be emulated with the transform: - // M = lshr(SIGN_BIT, Amt) + // M = lshr(SIGN_MASK, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && Op.getOpcode() == ISD::SRA) { - SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); R = DAG.getNode(ISD::XOR, dl, VT, R, M); @@ -22647,7 +22647,7 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + auto PTy = cast<PointerType>(LI->getPointerOperandType()); return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } @@ -26722,8 +26722,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // Low bits are known zero. KnownZero.setLowBits(ShAmt); } else { - KnownZero = KnownZero.lshr(ShAmt); - KnownOne = KnownOne.lshr(ShAmt); + KnownZero.lshrInPlace(ShAmt); + KnownOne.lshrInPlace(ShAmt); // High bits are known zero. KnownZero.setHighBits(ShAmt); } @@ -30152,7 +30152,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && - OpRHSConst->getAPIntValue().isSignBit()) + OpRHSConst->getAPIntValue().isSignMask()) // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. 
return DAG.getNode( @@ -30203,7 +30203,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); - APInt DemandedMask(APInt::getSignBit(BitWidth)); + APInt DemandedMask(APInt::getSignMask(BitWidth)); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); @@ -31269,7 +31269,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, else if (X86ISD::VSRAI == Opcode) Elt = Elt.ashr(ShiftImm); else - Elt = Elt.lshr(ShiftImm); + Elt.lshrInPlace(ShiftImm); } return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } @@ -32234,8 +32234,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); if (!BV || !BV->isConstant()) return false; - for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + for (SDValue Op : V->ops()) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); if (!C) return false; uint64_t Val = C->getZExtValue(); @@ -33428,8 +33428,8 @@ static SDValue isFNEG(SDNode *N) { SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); unsigned EltBits = Op1.getScalarValueSizeInBits(); - auto isSignBitValue = [&](const ConstantFP *C) { - return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits); + auto isSignMask = [&](const ConstantFP *C) { + return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits); }; // There is more than one way to represent the same constant on @@ -33440,21 +33440,21 @@ static SDValue isFNEG(SDNode *N) { // We check all variants here. if (Op1.getOpcode() == X86ISD::VBROADCAST) { if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) - if (isSignBitValue(cast<ConstantFP>(C))) + if (isSignMask(cast<ConstantFP>(C))) return Op0; } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) { if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) - if (isSignBitValue(CN->getConstantFPValue())) + if (isSignMask(CN->getConstantFPValue())) return Op0; } else if (auto *C = getTargetConstantFromNode(Op1)) { if (C->getType()->isVectorTy()) { if (auto *SplatV = C->getSplatValue()) - if (isSignBitValue(cast<ConstantFP>(SplatV))) + if (isSignMask(cast<ConstantFP>(SplatV))) return Op0; } else if (auto *FPConst = dyn_cast<ConstantFP>(C)) - if (isSignBitValue(FPConst)) + if (isSignMask(FPConst)) return Op0; } return SDValue(); @@ -34631,7 +34631,7 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, return SDValue(); ShrinkMode Mode; - if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode)) + if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); EVT VT = N->getValueType(0); @@ -35922,14 +35922,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Subtarget.is64Bit()) { Res.first = X86::RAX; Res.second = &X86::GR64_ADRegClass; - } else if (Subtarget.is32Bit()) { + } else { + assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && + "Expecting 64, 32 or 16 bit subtarget"); Res.first = X86::EAX; Res.second = &X86::GR32_ADRegClass; - } else if (Subtarget.is16Bit()) { - Res.first = X86::AX; - Res.second = &X86::GR16_ADRegClass; - } else { - llvm_unreachable("Expecting 64, 32 or 16 bit subtarget"); } return Res; } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index ab4910daca02..190a88335000 100644 --- 
a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1207,7 +1207,7 @@ namespace llvm { bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 6cc5e8b63597..fb9315792892 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -67,6 +67,8 @@ private: MachineFunction &MF) const; bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; const X86Subtarget &STI; const X86InstrInfo &TII; @@ -99,6 +101,10 @@ X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI, static const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { if (RB.getID() == X86::GPRRegBankID) { + if (Ty.getSizeInBits() <= 8) + return &X86::GR8RegClass; + if (Ty.getSizeInBits() == 16) + return &X86::GR16RegClass; if (Ty.getSizeInBits() == 32) return &X86::GR32RegClass; if (Ty.getSizeInBits() == 64) @@ -207,6 +213,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectConstant(I, MRI, MF)) return true; + if (selectTrunc(I, MRI, MF)) + return true; return selectImpl(I); } @@ -509,6 +517,59 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } +bool X86InstructionSelector::selectTrunc(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_TRUNC) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + + if (DstRB.getID() != SrcRB.getID()) { + DEBUG(dbgs() << "G_TRUNC input/output on different banks\n"); + return false; + } + + if (DstRB.getID() != X86::GPRRegBankID) + return false; + + const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); + if (!DstRC) + return false; + + const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); + if (!SrcRC) + return false; + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + return false; + } + + if (DstRC == SrcRC) { + // Nothing to be done + } else if (DstRC == &X86::GR32RegClass) { + I.getOperand(1).setSubReg(X86::sub_32bit); + } else if (DstRC == &X86::GR16RegClass) { + I.getOperand(1).setSubReg(X86::sub_16bit); + } else if (DstRC == &X86::GR8RegClass) { + I.getOperand(1).setSubReg(X86::sub_8bit); + } else { + return false; + } + + I.setDesc(TII.get(X86::COPY)); + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(X86Subtarget &Subtarget, X86RegisterBankInfo &RBI) { diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp index d395c826e6bf..0f8a750a0235 100644 --- a/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/lib/Target/X86/X86RegisterBankInfo.cpp @@ -68,6 +68,7 
@@ X86GenRegisterBankInfo::PartialMappingIdx X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) { if ((Ty.isScalar() && !isFP) || Ty.isPointer()) { switch (Ty.getSizeInBits()) { + case 1: case 8: return PMI_GPR8; case 16: diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 58fa31e94fba..25958f0c3106 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -133,6 +133,11 @@ public: unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; unsigned getStackRegister() const { return StackPtr; } unsigned getBaseRegister() const { return BasePtr; } + /// Returns the physical register used as the frame pointer. + /// This will always return the frame pointer register, unlike + /// getFrameRegister(), which returns the "base pointer" in situations + /// involving a stack, frame and base pointer. + unsigned getFramePtr() const { return FramePtr; } // FIXME: Move to FrameInfo unsigned getSlotSize() const { return SlotSize; } }; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index c177ba1d52f7..d235d2b40b15 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -438,7 +438,6 @@ def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, (add LOW32_ADDR_ACCESS, RBP)>; // A class to support the 'A' assembler constraint: [ER]AX then [ER]DX. -def GR16_AD : RegisterClass<"X86", [i16], 16, (add AX, DX)>; def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; def GR64_AD : RegisterClass<"X86", [i64], 64, (add RAX, RDX)>;
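Aside, not part of the patch: the LowerShift hunk above keeps the comment describing how a 64-bit vector arithmetic shift right is emulated from logical shifts, now phrased with SIGN_MASK: M = lshr(SIGN_MASK, Amt); ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M). A scalar C++ sketch of why that identity holds (the names here are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdint>

// After the logical shift, the original sign bit of R sits at bit 63 - Amt,
// and M has exactly that bit set. XOR flips it and the subtraction
// sign-extends it, matching what an arithmetic shift would produce.
// Amt must be in [0, 63], as for the hardware shift.
uint64_t emulatedAshr(uint64_t R, unsigned Amt) {
  uint64_t M = (1ULL << 63) >> Amt; // M = lshr(SIGN_MASK, Amt)
  return ((R >> Amt) ^ M) - M;      // sub(xor(lshr(R, Amt), M), M)
}

int main() {
  // Negative input: the vacated high bits fill with ones.
  assert(emulatedAshr(0xFFFFFFFF00000000ULL, 16) == 0xFFFFFFFFFFFF0000ULL);
  // Non-negative input: identical to a plain logical shift.
  assert(emulatedAshr(0x7FFF000000000000ULL, 8) == 0x007FFF0000000000ULL);
  return 0;
}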