Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp  266
1 file changed, 167 insertions, 99 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 547b71a6101a..22c662a79d87 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -132,6 +132,10 @@ cl::opt<bool> DisableAutoPairedVecSt(
cl::desc("disable automatically generated 32byte paired vector stores"),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned> PPCMinimumJumpTableEntries(
+ "ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
+ cl::desc("Set minimum number of entries to use a jump table on PPC"));
+
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM,
@@ -144,6 +148,12 @@ static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";
+// A faster local-exec TLS access sequence (enabled with the
+// -maix-small-local-exec-tls option) can be produced for TLS variables;
+// consistent with the IBM XL compiler, we apply a max size of slightly under
+// 32KB.
+constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@@ -389,7 +399,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// MASS transformation for LLVM intrinsics with replicating fast-math flag
// to be consistent to PPCGenScalarMASSEntries pass
- if (TM.getOptLevel() == CodeGenOpt::Aggressive) {
+ if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {
setOperationAction(ISD::FSIN , MVT::f64, Custom);
setOperationAction(ISD::FCOS , MVT::f64, Custom);
setOperationAction(ISD::FPOW , MVT::f64, Custom);
@@ -1419,6 +1429,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
setLibcallName(RTLIB::FMA_F128, "fmaf128");
+ setLibcallName(RTLIB::FREXP_F128, "frexpf128");
if (Subtarget.isAIXABI()) {
setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
@@ -1434,6 +1445,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setJumpIsExpensive();
}
+ // TODO: The default entry number is set to 64. This stops most jump table
+ // generation on PPC. But it is good for current PPC HWs because the indirect
+ // branch instruction mtctr to the jump table may lead to bad branch prediction.
+ // Re-evaluate this value on future HWs that can do better with mtctr.
+ setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
+
setMinFunctionAlignment(Align(4));
switch (Subtarget.getCPUDirective()) {
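The hunk above routes the new -ppc-min-jump-table-entries option into setMinimumJumpTableEntries. As a rough, hypothetical sketch of the cutoff it sets (LLVM's real switch lowering also weighs case density and range, which is not modeled here), a switch only becomes a jump-table candidate once its case count reaches the threshold:

#include <cstdio>

// Hypothetical sketch of the threshold wired up above: with the PPC default
// of 64, most small switches are lowered as compare/branch trees instead of
// an mtctr-based indirect branch through a jump table.
static bool mayUseJumpTable(unsigned NumCases,
                            unsigned MinJumpTableEntries = 64) {
  return NumCases >= MinJumpTableEntries;
}

int main() {
  std::printf("16 cases: %d\n", mayUseJumpTable(16)); // 0
  std::printf("64 cases: %d\n", mayUseJumpTable(64)); // 1
  return 0;
}

The threshold can still be lowered for a given run through the cl::opt defined earlier in this patch.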
@@ -1627,6 +1644,27 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
+bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
+ Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+ if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
+ return false;
+
+ if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {
+ if (VTy->getScalarType()->isIntegerTy()) {
+ // ElemSizeInBits 8/16 can fit in immediate field, not needed here.
+ if (ElemSizeInBits == 32) {
+ Index = Subtarget.isLittleEndian() ? 2 : 1;
+ return true;
+ }
+ if (ElemSizeInBits == 64) {
+ Index = Subtarget.isLittleEndian() ? 1 : 0;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
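The new shallExtractConstSplatVectorElementToStore hook above lets the target report which lane of a constant splat vector to extract so the value can feed a scalar VSX store directly. A minimal standalone sketch of just the lane selection, in plain C++ rather than LLVM's type hierarchy:

#include <optional>

// Sketch of the lane choice made by the hook above: pick the element of the
// 16-byte VSX register whose bytes line up with what the scalar store reads,
// which differs between little- and big-endian PPC64. 8- and 16-bit elements
// are skipped because they fit in an instruction immediate.
static std::optional<unsigned> splatStoreLane(unsigned ElemSizeInBits,
                                              bool IsLittleEndian) {
  if (ElemSizeInBits == 32)
    return IsLittleEndian ? 2u : 1u;
  if (ElemSizeInBits == 64)
    return IsLittleEndian ? 1u : 0u;
  return std::nullopt; // caller falls back to the generic lowering
}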
@@ -2936,7 +2974,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
template <typename Ty> static bool isValidPCRelNode(SDValue N) {
Ty *PCRelCand = dyn_cast<Ty>(N);
- return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
+ return PCRelCand && (PPCInstrInfo::hasPCRelFlag(PCRelCand->getTargetFlags()));
}
/// Returns true if this address is a PC Relative address.
@@ -3097,8 +3135,8 @@ static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
// Don't use the pic base if not in PIC relocation model.
if (IsPIC) {
- HiOpFlags |= PPCII::MO_PIC_FLAG;
- LoOpFlags |= PPCII::MO_PIC_FLAG;
+ HiOpFlags = PPCII::MO_PIC_HA_FLAG;
+ LoOpFlags = PPCII::MO_PIC_LO_FLAG;
}
}
@@ -3326,36 +3364,60 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool Is64Bit = Subtarget.isPPC64();
+ bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+ bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
- if (Model == TLSModel::LocalExec) {
+ if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
SDValue VariableOffsetTGA =
DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
SDValue TLSReg;
- if (Is64Bit)
- // For local-exec on AIX (64-bit), the sequence that is generated involves
- // a load of the variable offset (from the TOC), followed by an add of the
- // loaded variable offset to R13 (the thread pointer).
+ if (Is64Bit) {
+ // For local-exec and initial-exec on AIX (64-bit), the sequence generated
+ // involves a load of the variable offset (from the TOC), followed by an
+ // add of the loaded variable offset to R13 (the thread pointer).
// This code sequence looks like:
// ld reg1,var[TC](2)
// add reg2, reg1, r13 // r13 contains the thread pointer
TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
- else
- // For local-exec on AIX (32-bit), the sequence that is generated involves
- // loading the variable offset from the TOC, generating a call to
+
+ // With the -maix-small-local-exec-tls option, produce a faster access
+ // sequence for local-exec TLS variables where the offset from the TLS
+ // base is encoded as an immediate operand.
+ //
+ // We only utilize the faster local-exec access sequence when the TLS
+ // variable has a size within the policy limit. We treat types that are
+ // not sized or are empty as being over the policy size limit.
+ if (HasAIXSmallLocalExecTLS && IsTLSLocalExecModel) {
+ Type *GVType = GV->getValueType();
+ if (GVType->isSized() && !GVType->isEmptyTy() &&
+ GV->getParent()->getDataLayout().getTypeAllocSize(GVType) <=
+ AIXSmallTlsPolicySizeLimit)
+ return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA, TLSReg);
+ }
+ } else {
+ // For local-exec and initial-exec on AIX (32-bit), the sequence generated
+ // involves loading the variable offset from the TOC, generating a call to
// .__get_tpointer to get the thread pointer (which will be in R3), and
// adding the two together:
// lwz reg1,var[TC](2)
// bla .__get_tpointer
// add reg2, reg1, r3
TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);
+
+ // We do not implement the 32-bit version of the faster access sequence
+ // for local-exec that is controlled by -maix-small-local-exec-tls.
+ if (HasAIXSmallLocalExecTLS)
+ report_fatal_error("The small-local-exec TLS access sequence is "
+ "currently only supported on AIX (64-bit mode).");
+ }
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);
}
- // The Local-Exec and General-Dynamic TLS models are currently the only
- // supported access models. If Local-exec is not possible or specified, all
- // GlobalTLSAddress nodes are lowered using the general-dynamic model.
+ // Only Local-Exec, Initial-Exec and General-Dynamic TLS models are currently
+ // supported models. If Local- or Initial-exec are not possible or specified,
+ // all GlobalTLSAddress nodes are lowered using the general-dynamic model.
// We need to generate two TOC entries, one for the variable offset, one for
// the region handle. The global address for the TOC entry of the region
// handle is created with the MO_TLSGDM_FLAG flag and the global address
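For the 64-bit local-exec path above, the faster immediate-offset sequence is only produced when the TLS variable's allocation size fits the XL-compatible limit of 32751 bytes; unsized and empty types are treated as over the limit. A small standalone sketch of that gate, with the size passed in directly instead of being read off a GlobalValue:

#include <cstdint>
#include <optional>

constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751; // slightly under 32KB

// Sketch of the policy check above. std::nullopt models an unsized or empty
// type, which is treated as exceeding the limit, so the ordinary TOC-based
// local-exec sequence is used instead.
static bool useSmallLocalExecSequence(std::optional<uint64_t> AllocSize) {
  return AllocSize && *AllocSize <= AIXSmallTlsPolicySizeLimit;
}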
@@ -3393,8 +3455,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
if (Model == TLSModel::LocalExec) {
if (Subtarget.isUsingPCRelativeCalls()) {
SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
- SDValue TGA = DAG.getTargetGlobalAddress(
- GV, dl, PtrVT, 0, (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG));
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_TPREL_PCREL_FLAG);
SDValue MatAddr =
DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr);
@@ -3416,8 +3478,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
SDValue TGA = DAG.getTargetGlobalAddress(
GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
SDValue TGATLS = DAG.getTargetGlobalAddress(
- GV, dl, PtrVT, 0,
- IsPCRel ? (PPCII::MO_TLS | PPCII::MO_PCREL_FLAG) : PPCII::MO_TLS);
+ GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);
SDValue TPOffset;
if (IsPCRel) {
SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, TGA);
@@ -3513,8 +3574,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
EVT Ty = getPointerTy(DAG.getDataLayout());
if (isAccessedAsGotIndirect(Op)) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
- PPCII::MO_PCREL_FLAG |
- PPCII::MO_GOT_FLAG);
+ PPCII::MO_GOT_PCREL_FLAG);
SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
MachinePointerInfo());
@@ -3764,21 +3824,22 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
// Check all operands that may contain the LR.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
- unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ const InlineAsm::Flag Flags(
+ cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue());
+ unsigned NumVals = Flags.getNumOperandRegisters();
++i; // Skip the ID value.
- switch (InlineAsm::getKind(Flags)) {
+ switch (Flags.getKind()) {
default:
llvm_unreachable("Bad flags!");
- case InlineAsm::Kind_RegUse:
- case InlineAsm::Kind_Imm:
- case InlineAsm::Kind_Mem:
+ case InlineAsm::Kind::RegUse:
+ case InlineAsm::Kind::Imm:
+ case InlineAsm::Kind::Mem:
i += NumVals;
break;
- case InlineAsm::Kind_Clobber:
- case InlineAsm::Kind_RegDef:
- case InlineAsm::Kind_RegDefEarlyClobber: {
+ case InlineAsm::Kind::Clobber:
+ case InlineAsm::Kind::RegDef:
+ case InlineAsm::Kind::RegDefEarlyClobber: {
for (; NumVals; --NumVals, ++i) {
Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
if (Reg != PPC::LR && Reg != PPC::LR8)
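The hunk above migrates from the retired static helpers (InlineAsm::getKind, InlineAsm::getNumOperandRegisters) to the InlineAsm::Flag wrapper used by newer LLVM. A minimal sketch of decoding one INLINEASM operand-flag word with that wrapper, restricted to the calls that already appear in the diff (it needs the LLVM headers to build):

#include "llvm/IR/InlineAsm.h"

using namespace llvm;

// Sketch: decode an operand-flag word and report how many register operands
// follow it, mirroring how the loop above steps across the operand list.
static unsigned regsCoveredByFlagWord(unsigned Word) {
  const InlineAsm::Flag Flags(Word);
  switch (Flags.getKind()) {
  case InlineAsm::Kind::RegUse:
  case InlineAsm::Kind::RegDef:
  case InlineAsm::Kind::RegDefEarlyClobber:
  case InlineAsm::Kind::Clobber:
    return Flags.getNumOperandRegisters();
  default:
    return 0;
  }
}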
@@ -5278,7 +5339,7 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
// inserted into the DAG as part of call lowering. The restore of the TOC
// pointer is modeled by using a pseudo instruction for the call opcode that
// represents the 2 instruction sequence of an indirect branch and link,
- // immediately followed by a load of the TOC pointer from the the stack save
+ // immediately followed by a load of the TOC pointer from the stack save
// slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
// as it is not saved or used.
RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
@@ -7193,7 +7254,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
// be future work.
SDValue Store = DAG.getStore(
CopyFrom.getValue(1), dl, CopyFrom,
- DAG.getObjectPtrOffset(dl, FIN, TypeSize::Fixed(Offset)),
+ DAG.getObjectPtrOffset(dl, FIN, TypeSize::getFixed(Offset)),
MachinePointerInfo::getFixedStack(MF, FI, Offset));
MemOps.push_back(Store);
@@ -7373,12 +7434,12 @@ SDValue PPCTargetLowering::LowerCall_AIX(
}
auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
- return DAG.getExtLoad(
- ISD::ZEXTLOAD, dl, PtrVT, Chain,
- (LoadOffset != 0)
- ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset))
- : Arg,
- MachinePointerInfo(), VT);
+ return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
+ (LoadOffset != 0)
+ ? DAG.getObjectPtrOffset(
+ dl, Arg, TypeSize::getFixed(LoadOffset))
+ : Arg,
+ MachinePointerInfo(), VT);
};
unsigned LoadOffset = 0;
@@ -7408,11 +7469,11 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// Only memcpy the bytes that don't pass in register.
MemcpyFlags.setByValSize(ByValSize - LoadOffset);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(
- (LoadOffset != 0)
- ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset))
- : Arg,
- DAG.getObjectPtrOffset(dl, StackPtr,
- TypeSize::Fixed(ByValVA.getLocMemOffset())),
+ (LoadOffset != 0) ? DAG.getObjectPtrOffset(
+ dl, Arg, TypeSize::getFixed(LoadOffset))
+ : Arg,
+ DAG.getObjectPtrOffset(
+ dl, StackPtr, TypeSize::getFixed(ByValVA.getLocMemOffset())),
CallSeqStart, MemcpyFlags, DAG, dl);
continue;
}
@@ -8020,7 +8081,8 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// For more information, see section F.3 of the 2.06 ISA specification.
// With ISA 3.0
if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
- (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()))
+ (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()) ||
+ ResVT == MVT::f128)
return Op;
// If the RHS of the comparison is a 0.0, we don't need to do the
@@ -10254,11 +10316,6 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
bool isLittleEndian = Subtarget.isLittleEndian();
bool isPPC64 = Subtarget.isPPC64();
- // Only need to place items backwards in LE,
- // the mask will be properly calculated.
- if (isLittleEndian)
- std::swap(V1, V2);
-
if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
(V1->hasOneUse() || V2->hasOneUse())) {
LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
@@ -10268,7 +10325,8 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
// The second input to XXPERM is also an output so if the second input has
// multiple uses then copying is necessary, as a result we want the
// single-use operand to be used as the second input to prevent copying.
- if (!V2->hasOneUse() && V1->hasOneUse()) {
+ if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||
+ (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {
std::swap(V1, V2);
NeedSwap = !NeedSwap;
}
@@ -10307,27 +10365,24 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
- if (Opcode == PPCISD::XXPERM) {
- if (V1HasXXSWAPD) {
- if (SrcElt < 8)
- SrcElt += 8;
- else if (SrcElt < 16)
- SrcElt -= 8;
- }
- if (V2HasXXSWAPD) {
- if (SrcElt > 23)
- SrcElt -= 8;
- else if (SrcElt > 15)
- SrcElt += 8;
- }
- if (NeedSwap) {
- if (SrcElt < 16)
- SrcElt += 16;
- else
- SrcElt -= 16;
- }
+ if (V1HasXXSWAPD) {
+ if (SrcElt < 8)
+ SrcElt += 8;
+ else if (SrcElt < 16)
+ SrcElt -= 8;
+ }
+ if (V2HasXXSWAPD) {
+ if (SrcElt > 23)
+ SrcElt -= 8;
+ else if (SrcElt > 15)
+ SrcElt += 8;
+ }
+ if (NeedSwap) {
+ if (SrcElt < 16)
+ SrcElt += 16;
+ else
+ SrcElt -= 16;
}
-
for (unsigned j = 0; j != BytesPerElement; ++j)
if (isLittleEndian)
ResultMask.push_back(
@@ -10337,18 +10392,19 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32));
}
- if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) {
- if (V1HasXXSWAPD) {
- dl = SDLoc(V1->getOperand(0));
- V1 = V1->getOperand(0)->getOperand(1);
- }
- if (V2HasXXSWAPD) {
- dl = SDLoc(V2->getOperand(0));
- V2 = V2->getOperand(0)->getOperand(1);
- }
- if (isPPC64 && ValType != MVT::v2f64)
+ if (V1HasXXSWAPD) {
+ dl = SDLoc(V1->getOperand(0));
+ V1 = V1->getOperand(0)->getOperand(1);
+ }
+ if (V2HasXXSWAPD) {
+ dl = SDLoc(V2->getOperand(0));
+ V2 = V2->getOperand(0)->getOperand(1);
+ }
+
+ if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
+ if (ValType != MVT::v2f64)
V1 = DAG.getBitcast(MVT::v2f64, V1);
- if (isPPC64 && V2.getValueType() != MVT::v2f64)
+ if (V2.getValueType() != MVT::v2f64)
V2 = DAG.getBitcast(MVT::v2f64, V2);
}
@@ -10369,6 +10425,11 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
if (Opcode == PPCISD::XXPERM)
VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask);
+ // Only need to place items backwards in LE,
+ // the mask was properly calculated.
+ if (isLittleEndian)
+ std::swap(V1, V2);
+
SDValue VPERMNode =
DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask);
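In the reworked LowerVPERM above, the XXSWAPD and operand-swap corrections are now applied to every source element (the XXPERM-only guard is gone), and for little-endian the V1/V2 swap happens only after the permute mask has been built. A standalone sketch of the per-element index arithmetic, assuming indices 0-15 address bytes of V1 and 16-31 address bytes of V2:

// Sketch of the per-byte remapping performed in the loop above. XXSWAPD on
// an input exchanges its two doublewords, so the matching 8-byte halves of
// the index range are exchanged; NeedSwap redirects an index from one input
// vector to the other.
static unsigned remapSrcElt(unsigned SrcElt, bool V1HasXXSWAPD,
                            bool V2HasXXSWAPD, bool NeedSwap) {
  if (V1HasXXSWAPD) {
    if (SrcElt < 8)
      SrcElt += 8;
    else if (SrcElt < 16)
      SrcElt -= 8;
  }
  if (V2HasXXSWAPD) {
    if (SrcElt > 23)
      SrcElt -= 8;
    else if (SrcElt > 15)
      SrcElt += 8;
  }
  if (NeedSwap)
    SrcElt = SrcElt < 16 ? SrcElt + 16 : SrcElt - 16;
  return SrcElt;
}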
@@ -11037,14 +11098,14 @@ SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
SmallVector<SDValue, 4> Ops{
N->getOperand(0),
DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)};
- SDValue Val = N->getOperand(2);
+ SDValue Val = N->getOperand(1);
SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val);
SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val,
DAG.getConstant(64, dl, MVT::i32));
ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi);
Ops.push_back(ValLo);
Ops.push_back(ValHi);
- Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(2));
return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT,
N->getMemOperand());
}
@@ -16659,13 +16720,14 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
- std::vector<SDValue>&Ops,
+ StringRef Constraint,
+ std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints.
- if (Constraint.length() > 1) return;
+ if (Constraint.size() > 1)
+ return;
char Letter = Constraint[0];
switch (Letter) {
@@ -17075,13 +17137,23 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
- if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
+ if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) {
// We should use Altivec/VSX loads and stores when available. For unaligned
// addresses, unaligned VSX loads are only fast starting with the P8.
- if (Subtarget.hasAltivec() && Op.size() >= 16 &&
- (Op.isAligned(Align(16)) ||
- ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
- return MVT::v4i32;
+ if (Subtarget.hasAltivec() && Op.size() >= 16) {
+ if (Op.isMemset() && Subtarget.hasVSX()) {
+ uint64_t TailSize = Op.size() % 16;
+ // For memset lowering, EXTRACT_VECTOR_ELT tries to return constant
+ // element if vector element type matches tail store. For tail size
+ // 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.
+ if (TailSize > 2 && TailSize <= 4) {
+ return MVT::v8i16;
+ }
+ return MVT::v4i32;
+ }
+ if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())
+ return MVT::v4i32;
+ }
}
if (Subtarget.isPPC64()) {
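The memset branch above chooses between v4i32 and v8i16 based on the size of the trailing partial store. A standalone sketch of that choice, with the MVTs reduced to plain strings:

#include <cstdint>
#include <string>

// Sketch of the memset-specific type selection above (VSX available and
// total size >= 16). Per the comment in the patch, a 3- or 4-byte tail is
// stored as an i32, and in that case v8i16 is returned instead of v4i32.
static std::string memsetVectorType(uint64_t Size) {
  uint64_t TailSize = Size % 16;
  if (TailSize > 2 && TailSize <= 4)
    return "v8i16";
  return "v4i32";
}

For example, a 20-byte memset has a 4-byte tail and picks v8i16, while a 32-byte memset has no tail and stays with v4i32.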
@@ -17227,7 +17299,7 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
Type *Ty) const {
- if (Subtarget.hasSPE())
+ if (Subtarget.hasSPE() || Subtarget.useSoftFloat())
return false;
switch (Ty->getScalarType()->getTypeID()) {
case Type::FloatTyID:
@@ -17431,7 +17503,7 @@ bool PPCTargetLowering::useLoadStackGuardNode() const {
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
if (Subtarget.isAIXABI()) {
M.getOrInsertGlobal(AIXSSPCanaryWordName,
- Type::getInt8PtrTy(M.getContext()));
+ PointerType::getUnqual(M.getContext()));
return;
}
if (!Subtarget.isTargetLinux())
@@ -18539,9 +18611,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo");
Value *IncrHi =
Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi");
- Value *Addr =
- Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
- Value *LoHi = Builder.CreateCall(RMW, {Addr, IncrLo, IncrHi});
+ Value *LoHi = Builder.CreateCall(RMW, {AlignedAddr, IncrLo, IncrHi});
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
@@ -18566,11 +18636,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
Value *NewLo = Builder.CreateTrunc(NewVal, Int64Ty, "new_lo");
Value *NewHi =
Builder.CreateTrunc(Builder.CreateLShr(NewVal, 64), Int64Ty, "new_hi");
- Value *Addr =
- Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
emitLeadingFence(Builder, CI, Ord);
Value *LoHi =
- Builder.CreateCall(IntCmpXchg, {Addr, CmpLo, CmpHi, NewLo, NewHi});
+ Builder.CreateCall(IntCmpXchg, {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
emitTrailingFence(Builder, CI, Ord);
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");