Diffstat (limited to 'lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r-- | lib/Target/ARM/ARMTargetTransformInfo.cpp | 275
1 file changed, 267 insertions(+), 8 deletions(-)
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index f72bb8632eb7..2a8ec734a05f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -22,6 +21,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/Casting.h"
@@ -36,6 +36,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "armtti"
 
+static cl::opt<bool> DisableLowOverheadLoops(
+    "disable-arm-loloops", cl::Hidden, cl::init(true),
+    cl::desc("Disable the generation of low-overhead loops"));
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -107,9 +111,13 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
       Idx == 1)
     return 0;
 
-  if (Opcode == Instruction::And)
-    // Conversion to BIC is free, and means we can use ~Imm instead.
-    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+  if (Opcode == Instruction::And) {
+    // UXTB/UXTH
+    if (Imm == 255 || Imm == 65535)
+      return 0;
+    // Conversion to BIC is free, and means we can use ~Imm instead.
+    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+  }
 
   if (Opcode == Instruction::Add)
     // Conversion to SUB is free, and means we can use -Imm instead.
@@ -398,6 +406,40 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
   return 1;
 }
 
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
+  assert(MI && "MemcpyInst expected");
+  ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
+
+  // To model the cost of a library call, we assume 1 for the call, and
+  // 3 for the argument setup.
+  const unsigned LibCallCost = 4;
+
+  // If 'size' is not a constant, a library call will be generated.
+  if (!C)
+    return LibCallCost;
+
+  const unsigned Size = C->getValue().getZExtValue();
+  const unsigned DstAlign = MI->getDestAlignment();
+  const unsigned SrcAlign = MI->getSourceAlignment();
+  const Function *F = I->getParent()->getParent();
+  const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+  std::vector<EVT> MemOps;
+
+  // MemOps will be populated with a list of data types that need to be
+  // loaded and stored. That's why we multiply the number of elements by 2 to
+  // get the cost for this memcpy.
+  if (getTLI()->findOptimalMemOpLowering(
+          MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
+          false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
+          MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
+          F->getAttributes()))
+    return MemOps.size() * 2;
+
+  // If we can't find an optimal memop lowering, return the default cost.
+  return LibCallCost;
+}
+
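To make the memcpy cost model above concrete, here is a minimal standalone C++ sketch of the same arithmetic. It is not the LLVM API: memcpyCost is a hypothetical local helper, and the naive power-of-two splitter stands in for findOptimalMemOpLowering. It only reproduces the shape of the heuristic: a constant-size copy costs two units (one load, one store) per memory operation the lowering would emit, and anything else falls back to the library-call cost of 4 (1 for the call, 3 for argument setup).

#include <cstdio>
#include <vector>

// Simplified stand-in for the cost query above: split a constant-size copy
// into the widest accesses the alignment allows (capped at 4 bytes, roughly a
// 32-bit LDR/STR) and charge 2 (load + store) per access. A non-constant size,
// or more accesses than the limit, is priced as a library call.
static unsigned memcpyCost(bool SizeIsConstant, unsigned Size, unsigned Align,
                           unsigned Limit = 16) {
  const unsigned LibCallCost = 4; // 1 for the call, 3 for argument setup.
  if (!SizeIsConstant)
    return LibCallCost;

  std::vector<unsigned> MemOps;
  unsigned Width = Align >= 4 ? 4 : Align; // widest access assumed legal here
  for (unsigned Remaining = Size; Remaining != 0;) {
    while (Width > Remaining)
      Width /= 2;
    MemOps.push_back(Width);
    Remaining -= Width;
    if (MemOps.size() > Limit) // too many ops: keep the library call
      return LibCallCost;
  }
  return MemOps.size() * 2; // one load and one store per element
}

int main() {
  // A 16-byte, 4-byte-aligned copy models as 4 word accesses -> cost 8.
  std::printf("cost(16, align 4) = %u\n", memcpyCost(true, 16, 4));
  // A copy whose length is unknown at compile time -> library call, cost 4.
  std::printf("cost(dynamic)     = %u\n", memcpyCost(false, 0, 4));
  return 0;
}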
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
   if (Kind == TTI::SK_Broadcast) {
@@ -590,6 +632,222 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F);
+
+  // Assume all Arm-specific intrinsics map to an instruction.
+  if (F->getName().startswith("llvm.arm"))
+    return false;
+
+  switch (F->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::powi:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+    return true;
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::copysign:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::round:
+  case Intrinsic::canonicalize:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+    // Some operations can be handled by vector instructions and assume
+    // unsupported vectors will be expanded into supported scalar ones.
+    // TODO Handle scalar operations properly.
+    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter:
+    return !ST->hasMVEIntegerOps();
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
+    return false;
+  }
+
+  return BaseT::isLoweredToCall(F);
+}
+
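The classification above boils down to a small policy: Arm-specific intrinsics are assumed to map to instructions, transcendental math always becomes a library call, and "simple" FP operations (sqrt, fabs, rounding, conversions to/from int) become calls only when the FP unit cannot handle the type. The following standalone C++ sketch models that policy under stated assumptions; FPSupport, simpleFPIsCall, and the example subtarget are hypothetical and only illustrate the decision, they are not the LLVM code.

#include <cstdio>

// Hypothetical, simplified model of the policy: the real code switches over
// Intrinsic IDs and queries the ARMSubtarget.
struct FPSupport {
  bool HasFP64;     // double-precision hardware
  bool HasFullFP16; // half-precision hardware
  bool HasVFP;      // any scalar FP unit at all
};

enum class Ty { Float, Double, Half };

// Transcendentals (sin, cos, pow, exp, log, ...) always become libcalls.
static bool transcendentalIsCall() { return true; }

// "Simple" FP ops (sqrt, fabs, floor, rint, ...) become calls only when the
// FPU cannot handle the operand type natively.
static bool simpleFPIsCall(Ty T, const FPSupport &FP) {
  if (T == Ty::Double && !FP.HasFP64)
    return true;
  if (T == Ty::Half && !FP.HasFullFP16)
    return true;
  return !FP.HasVFP;
}

int main() {
  // A single-precision-only FPU, roughly a Cortex-M4-class core.
  FPSupport SinglePrecisionOnly{/*HasFP64=*/false, /*HasFullFP16=*/false,
                                /*HasVFP=*/true};
  std::printf("sin  f32 -> call? %d\n", transcendentalIsCall());
  std::printf("sqrt f32 -> call? %d\n",
              simpleFPIsCall(Ty::Float, SinglePrecisionOnly));
  std::printf("sqrt f64 -> call? %d\n",
              simpleFPIsCall(Ty::Double, SinglePrecisionOnly));
  return 0;
}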
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          HardwareLoopInfo &HWLoopInfo) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m.
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  // We need to store the trip count in LR, a 32-bit register.
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+  // point in generating a hardware loop if that's going to happen.
+  auto MaybeCall = [this](Instruction &I) {
+    const ARMTargetLowering *TLI = getTLI();
+    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+    EVT VT = TLI->getValueType(DL, I.getType(), true);
+    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+      return true;
+
+    // Check if an intrinsic will be lowered to a call and assume that any
+    // other CallInst will generate a bl.
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (isa<IntrinsicInst>(Call)) {
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
+      return true;
+    }
+
+    // FPv5 provides conversions between integer, double-precision,
+    // single-precision, and half-precision formats.
+    switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+      return !ST->hasFPARMv8Base();
+    }
+
+    // FIXME: Unfortunately the approach of checking the Operation Action does
+    // not catch all cases of Legalization that use library calls. Our
+    // Legalization step categorizes some transformations into library calls as
+    // Custom, Expand or even Legal when doing type legalization. So for now
+    // we have to special case for instance the SDIV of 64bit integers and the
+    // use of floating point emulation.
+    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+      switch (ISD) {
+      default:
+        break;
+      case ISD::SDIV:
+      case ISD::UDIV:
+      case ISD::SREM:
+      case ISD::UREM:
+      case ISD::SDIVREM:
+      case ISD::UDIVREM:
+        return true;
+      }
+    }
+
+    // Assume all other non-float operations are supported.
+    if (!VT.isFloatingPoint())
+      return false;
+
+    // We'll need a library call to handle most floats when using soft.
+    if (TLI->useSoftFloat()) {
+      switch (I.getOpcode()) {
+      default:
+        return true;
+      case Instruction::Alloca:
+      case Instruction::Load:
+      case Instruction::Store:
+      case Instruction::Select:
+      case Instruction::PHI:
+        return false;
+      }
+    }
+
+    // We'll need a libcall to perform double precision operations on a single
+    // precision only FPU.
+    if (I.getType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+
+    // Likewise for half precision arithmetic.
+    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+
+    return false;
+  };
+
+  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
+    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
+      switch (Call->getIntrinsicID()) {
+      default:
+        break;
+      case Intrinsic::set_loop_iterations:
+      case Intrinsic::test_set_loop_iterations:
+      case Intrinsic::loop_decrement:
+      case Intrinsic::loop_decrement_reg:
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // Scan the instructions to see if there's any that we know will turn into a
+  // call or if this loop is already a low-overhead loop.
+  auto ScanLoop = [&](Loop *L) {
+    for (auto *BB : L->getBlocks()) {
+      for (auto &I : *BB) {
+        if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
+          return false;
+      }
+    }
+    return true;
+  };
+
+  // Visit inner loops.
+  for (auto Inner : *L)
+    if (!ScanLoop(Inner))
+      return false;
+
+  if (!ScanLoop(L))
+    return false;
+
+  // TODO: Check whether the trip count calculation is expensive. If L is the
+  // inner loop but we know it has a low trip count, calculating that trip
+  // count (in the parent loop) may be detrimental.
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CounterInReg = true;
+  HWLoopInfo.IsNestingLegal = false;
+  HWLoopInfo.PerformEntryTest = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
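The overall decision in isHardwareLoopProfitable is: require the LOB extension and a loop-invariant trip count that fits in 32 bits (it lives in LR), then reject the loop if any instruction in it, or in any of its inner loops, might lower to a call (which would clobber LR) or is already a hardware-loop intrinsic. The standalone C++ sketch below models just that scan; Inst, Loop, and the flags are hypothetical simplifications of the IR walk and subtarget queries, not the LLVM types.

#include <cstdio>
#include <vector>

// Deliberately simplified stand-in for the profitability scan above.
struct Inst {
  bool MayBecomeCall;  // would lower to bl / libcall (trashes LR)
  bool IsHWLoopIntrin; // llvm.set.loop.iterations etc. already present
};

struct Loop {
  std::vector<Inst> Body;
  std::vector<Loop> Inner;
};

static bool scan(const Loop &L) {
  for (const Inst &I : L.Body)
    if (I.MayBecomeCall || I.IsHWLoopIntrin)
      return false; // a call clobbers LR; an existing hw-loop means stop
  return true;
}

static bool hardwareLoopProfitable(const Loop &L, bool HasLOB,
                                   bool TripCountFitsIn32Bits) {
  if (!HasLOB || !TripCountFitsIn32Bits)
    return false;
  for (const Loop &Inner : L.Inner) // inner loops must be clean too
    if (!scan(Inner))
      return false;
  return scan(L);
}

int main() {
  Loop Simple{{{false, false}, {false, false}}, {}};
  Loop WithCall{{{true, false}}, {}};
  std::printf("simple loop   -> %d\n", hardwareLoopProfitable(Simple, true, true));
  std::printf("loop w/ call  -> %d\n", hardwareLoopProfitable(WithCall, true, true));
  return 0;
}

When the scan succeeds, the HWLoopInfo fields above request a 32-bit counter kept in a register, decremented by 1 per iteration, with an entry test and no nesting.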
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
@@ -599,7 +857,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable loop unrolling for Oz and Os.
   UP.OptSizeThreshold = 0;
   UP.PartialOptSizeThreshold = 0;
-  if (L->getHeader()->getParent()->optForSize())
+  if (L->getHeader()->getParent()->hasOptSize())
     return;
 
   // Only enable on Thumb-2 targets.
@@ -645,6 +903,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 
   UP.Partial = true;
   UP.Runtime = true;
+  UP.UpperBound = true;
   UP.UnrollRemainder = true;
   UP.DefaultUnrollRuntimeCount = 4;
   UP.UnrollAndJam = true;
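For readers less familiar with the unrolling preferences touched in the last hunk: Partial/Runtime unrolling with DefaultUnrollRuntimeCount = 4 asks the unroller to do roughly four iterations of work per backedge and handle the leftovers separately. The following standalone C++ snippet is only an illustration of the source-level shape of that transformation (the compiler of course performs it on IR); sumUnrolled is a hypothetical example function.

#include <cstddef>
#include <cstdio>

// Roughly what a runtime unroll by 4 plus a remainder loop does to a simple
// accumulation.
static int sumUnrolled(const int *Data, std::size_t N) {
  int Sum = 0;
  std::size_t I = 0;
  // Main loop: 4 iterations of work per backedge.
  for (; I + 4 <= N; I += 4)
    Sum += Data[I] + Data[I + 1] + Data[I + 2] + Data[I + 3];
  // Remainder loop: the leftover 0-3 iterations.
  for (; I < N; ++I)
    Sum += Data[I];
  return Sum;
}

int main() {
  int Data[] = {1, 2, 3, 4, 5, 6, 7};
  std::printf("%d\n", sumUnrolled(Data, 7)); // 28
  return 0;
}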