author    Dimitry Andric <dim@FreeBSD.org>    2020-07-31 21:22:58 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-07-31 21:22:58 +0000
commit    5ffd83dbcc34f10e07f6d3e968ae6365869615f4 (patch)
tree      0e9f5cf729dde39f949698fddef45a34e2bc7f44 /contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
parent    1799696096df87b52968b8996d00c91e0a5de8d9 (diff)
parent    cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff)
Merge llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp
master 2e10b7a39b9, the last commit before the llvmorg-12-init tag, from which release/11.x was branched. Note that for now, I rolled back all our local changes to make merging easier, and I will reapply the still-relevant ones after updating to 11.0.0-rc1.
Notes: svn path=/projects/clang1100-import/; revision=363742
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r--    contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp    2820
1 file changed, 2020 insertions(+), 800 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ca1649fae258..ddfbd04e1ebc 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -55,7 +55,6 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -118,14 +117,13 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
-static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
-cl::desc("enable quad precision float support on ppc"), cl::Hidden);
-
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
+STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
+STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
@@ -260,15 +258,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// PowerPC has no SREM/UREM instructions unless we are on P9
// On P9 we may use a hardware instruction to compute the remainder.
- // The instructions are not legalized directly because in the cases where the
- // result of both the remainder and the division is required it is more
- // efficient to compute the remainder from the result of the division rather
- // than use the remainder instruction.
+ // When the result of both the remainder and the division is required it is
+ // more efficient to compute the remainder from the result of the division
+ // rather than use the remainder instruction. The instructions are legalized
+ // directly because the DivRemPairsPass performs the transformation at the IR
+ // level.
if (Subtarget.isISA3_0()) {
- setOperationAction(ISD::SREM, MVT::i32, Custom);
- setOperationAction(ISD::UREM, MVT::i32, Custom);
- setOperationAction(ISD::SREM, MVT::i64, Custom);
- setOperationAction(ISD::UREM, MVT::i64, Custom);
+ setOperationAction(ISD::SREM, MVT::i32, Legal);
+ setOperationAction(ISD::UREM, MVT::i32, Legal);
+ setOperationAction(ISD::SREM, MVT::i64, Legal);
+ setOperationAction(ISD::UREM, MVT::i64, Legal);
} else {
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
@@ -286,6 +285,40 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ // Handle constrained floating-point operations of scalar.
+ // TODO: Handle SPE specific operation.
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
+ if (Subtarget.hasVSX())
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal);
+
+ if (Subtarget.hasFSQRT()) {
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
+ }
+
+ if (Subtarget.hasFPRND()) {
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);
+
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
+ }
+
// We don't support sin/cos/sqrt/fmod/pow
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -390,6 +423,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasSPE()) {
// SPE has built-in conversions
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
@@ -539,9 +575,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
- if (Subtarget.hasSPE())
+ if (Subtarget.hasSPE()) {
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
- else
+ } else
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
@@ -584,6 +621,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
if (Subtarget.hasAltivec()) {
+ for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
+ }
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -738,6 +781,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (!Subtarget.hasP8Altivec())
setOperationAction(ISD::ABS, MVT::v2i64, Expand);
+ // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
+ setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
if (Subtarget.hasAltivec())
for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
@@ -764,7 +809,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
- setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
@@ -811,12 +856,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
@@ -906,6 +955,37 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ // Handle constrained floating-point operations of vector.
+ // The predictor is `hasVSX` because altivec instruction has
+ // no exception but VSX vector instruction has.
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
+
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
}
@@ -925,44 +1005,59 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
- if (EnableQuadPrecision) {
- addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
- setOperationAction(ISD::FADD, MVT::f128, Legal);
- setOperationAction(ISD::FSUB, MVT::f128, Legal);
- setOperationAction(ISD::FDIV, MVT::f128, Legal);
- setOperationAction(ISD::FMUL, MVT::f128, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
- // No extending loads to f128 on PPC.
- for (MVT FPT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
- setOperationAction(ISD::FMA, MVT::f128, Legal);
- setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
- setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
- setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
- setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
-
- setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
- setOperationAction(ISD::FRINT, MVT::f128, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
- setOperationAction(ISD::FCEIL, MVT::f128, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
- setOperationAction(ISD::FROUND, MVT::f128, Legal);
-
- setOperationAction(ISD::SELECT, MVT::f128, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
- setTruncStoreAction(MVT::f128, MVT::f64, Expand);
- setTruncStoreAction(MVT::f128, MVT::f32, Expand);
- setOperationAction(ISD::BITCAST, MVT::i128, Custom);
- // No implementation for these ops for PowerPC.
- setOperationAction(ISD::FSIN , MVT::f128, Expand);
- setOperationAction(ISD::FCOS , MVT::f128, Expand);
- setOperationAction(ISD::FPOW, MVT::f128, Expand);
- setOperationAction(ISD::FPOWI, MVT::f128, Expand);
- setOperationAction(ISD::FREM, MVT::f128, Expand);
- }
+ addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+ setOperationAction(ISD::FADD, MVT::f128, Legal);
+ setOperationAction(ISD::FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+ // No extending loads to f128 on PPC.
+ for (MVT FPT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+ setOperationAction(ISD::FMA, MVT::f128, Legal);
+ setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
+
+ setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
+ setOperationAction(ISD::FRINT, MVT::f128, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f128, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
+ setOperationAction(ISD::FROUND, MVT::f128, Legal);
+
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+ // No implementation for these ops for PowerPC.
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+
+ // Handle constrained floating-point operations of fp128
+ setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
@@ -1135,6 +1230,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
}
+
+ // TODO: Handle constrained floating-point operations of v4f64
}
if (Subtarget.has64BitSupport())
@@ -1169,6 +1266,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -1208,34 +1306,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::VSELECT);
}
- // Darwin long double math library functions have $LDBL128 appended.
- if (Subtarget.isDarwin()) {
- setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
- setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
- setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
- setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
- setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
- setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
- setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
- setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
- setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
- setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
- }
-
- if (EnableQuadPrecision) {
- setLibcallName(RTLIB::LOG_F128, "logf128");
- setLibcallName(RTLIB::LOG2_F128, "log2f128");
- setLibcallName(RTLIB::LOG10_F128, "log10f128");
- setLibcallName(RTLIB::EXP_F128, "expf128");
- setLibcallName(RTLIB::EXP2_F128, "exp2f128");
- setLibcallName(RTLIB::SIN_F128, "sinf128");
- setLibcallName(RTLIB::COS_F128, "cosf128");
- setLibcallName(RTLIB::POW_F128, "powf128");
- setLibcallName(RTLIB::FMIN_F128, "fminf128");
- setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
- setLibcallName(RTLIB::POWI_F128, "__powikf2");
- setLibcallName(RTLIB::REM_F128, "fmodf128");
- }
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::POWI_F128, "__powikf2");
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
@@ -1245,8 +1327,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
setMinFunctionAlignment(Align(4));
- if (Subtarget.isDarwin())
- setPrefFunctionAlignment(Align(16));
switch (Subtarget.getCPUDirective()) {
default: break;
@@ -1263,6 +1343,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
+ case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
setPrefLoopAlignment(Align(16));
setPrefFunctionAlignment(Align(16));
@@ -1298,27 +1379,33 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
}
+
+ // Let the subtarget (CPU) decide if a predictable select is more expensive
+ // than the corresponding branch. This information is used in CGP to decide
+ // when to convert selects into branches.
+ PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
}
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
- unsigned MaxMaxAlign) {
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
if (MaxAlign == MaxMaxAlign)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
- if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
- MaxAlign = 32;
- else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
- MaxAlign = 16;
+ if (MaxMaxAlign >= 32 &&
+ VTy->getPrimitiveSizeInBits().getFixedSize() >= 256)
+ MaxAlign = Align(32);
+ else if (VTy->getPrimitiveSizeInBits().getFixedSize() >= 128 &&
+ MaxAlign < 16)
+ MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
@@ -1332,16 +1419,12 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
- // Darwin passes everything on 4 byte boundary.
- if (Subtarget.isDarwin())
- return 4;
-
// 16byte and wider vectors are passed on 16byte boundary.
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
- unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+ Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
if (Subtarget.hasAltivec() || Subtarget.hasQPX())
- getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
- return Align;
+ getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16));
+ return Alignment.value();
}
bool PPCTargetLowering::useSoftFloat() const {
@@ -1356,6 +1439,16 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
+/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
+/// type is cheaper than a multiply followed by a shift.
+/// This is true for words and doublewords on 64-bit PowerPC.
+bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
+ if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
+ isOperationLegal(ISD::MULHU, Type)))
+ return true;
+ return TargetLowering::isMulhCheaperThanMulShift(Type);
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1377,10 +1470,12 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
- case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
- case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
+ case PPCISD::XXSPLTI_SP_TO_DP:
+ return "PPCISD::XXSPLTI_SP_TO_DP";
+ case PPCISD::XXSPLTI32DX:
+ return "PPCISD::XXSPLTI32DX";
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
@@ -1392,6 +1487,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
+ case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
case PPCISD::SRL: return "PPCISD::SRL";
case PPCISD::SRA: return "PPCISD::SRA";
@@ -1399,6 +1495,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
+ case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
@@ -1412,6 +1509,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
+ case PPCISD::SCALAR_TO_VECTOR_PERMUTED:
+ return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";
case PPCISD::ANDI_rec_1_EQ_BIT:
return "PPCISD::ANDI_rec_1_EQ_BIT";
case PPCISD::ANDI_rec_1_GT_BIT:
@@ -1425,7 +1524,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
case PPCISD::STXSIX: return "PPCISD::STXSIX";
case PPCISD::VEXTS: return "PPCISD::VEXTS";
- case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
@@ -1475,7 +1573,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
+ case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+ case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
}
return nullptr;
}
@@ -2338,17 +2438,22 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
/// non-zero and N can be represented by a base register plus a signed 16-bit
/// displacement, make a more precise judgement by checking (displacement % \p
/// EncodingAlignment).
-bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
- SDValue &Index, SelectionDAG &DAG,
- unsigned EncodingAlignment) const {
- int16_t imm = 0;
+bool PPCTargetLowering::SelectAddressRegReg(
+ SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
+ MaybeAlign EncodingAlignment) const {
+ // If we have a PC Relative target flag don't select as [reg+reg]. It will be
+ // a [pc+imm].
+ if (SelectAddressPCRel(N, Base))
+ return false;
+
+ int16_t Imm = 0;
if (N.getOpcode() == ISD::ADD) {
// Is there any SPE load/store (f64), which can't handle 16bit offset?
// SPE load/store can only handle 8-bit offsets.
if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
return true;
- if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || !(imm % EncodingAlignment)))
+ if (isIntS16Immediate(N.getOperand(1), Imm) &&
+ (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // r+i
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
return false; // r+i
@@ -2357,8 +2462,8 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
Index = N.getOperand(1);
return true;
} else if (N.getOpcode() == ISD::OR) {
- if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || !(imm % EncodingAlignment)))
+ if (isIntS16Immediate(N.getOperand(1), Imm) &&
+ (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // r+i can fold it if we can.
// If this is an or of disjoint bitfields, we can codegen this as an add
@@ -2413,8 +2518,7 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
- if (Align >= 4)
+ if (MFI.getObjectAlign(FrameIdx) >= Align(4))
return;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -2425,12 +2529,17 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
-bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
- SDValue &Base,
- SelectionDAG &DAG,
- unsigned EncodingAlignment) const {
+bool PPCTargetLowering::SelectAddressRegImm(
+ SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
+ MaybeAlign EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
+
+ // If we have a PC Relative target flag don't select as [reg+imm]. It will be
+ // a [pc+imm].
+ if (SelectAddressPCRel(N, Base))
+ return false;
+
// If this can be more profitably realized as r+r, fail.
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
@@ -2438,7 +2547,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
+ (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -2462,7 +2571,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
+ (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
@@ -2489,7 +2598,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// this as "d, 0"
int16_t Imm;
if (isIntS16Immediate(CN, Imm) &&
- (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
+ (!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
@@ -2499,7 +2608,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
- (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
+ (!EncodingAlignment ||
+ isAligned(*EncodingAlignment, CN->getZExtValue()))) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
@@ -2554,6 +2664,27 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
return true;
}
+template <typename Ty> static bool isValidPCRelNode(SDValue N) {
+ Ty *PCRelCand = dyn_cast<Ty>(N);
+ return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
+}
+
+/// Returns true if this address is a PC Relative address.
+/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
+/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
+bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
+ // This is a materialize PC Relative node. Always select this as PC Relative.
+ Base = N;
+ if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
+ return true;
+ if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
+ isValidPCRelNode<GlobalAddressSDNode>(N) ||
+ isValidPCRelNode<JumpTableSDNode>(N) ||
+ isValidPCRelNode<BlockAddressSDNode>(N))
+ return true;
+ return false;
+}
+
/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
@@ -2591,7 +2722,8 @@ static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
UI != UE; ++UI)
if (UI.getUse().get().getResNo() == 0 &&
- UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
return false;
return true;
@@ -2664,14 +2796,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, None))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
return false;
}
@@ -2705,18 +2837,6 @@ static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
HiOpFlags |= PPCII::MO_PIC_FLAG;
LoOpFlags |= PPCII::MO_PIC_FLAG;
}
-
- // If this is a reference to a global value that requires a non-lazy-ptr, make
- // sure that instruction lowering adds it.
- if (GV && Subtarget.hasLazyResolverStub(GV)) {
- HiOpFlags |= PPCII::MO_NLP_FLAG;
- LoOpFlags |= PPCII::MO_NLP_FLAG;
-
- if (GV->hasHiddenVisibility()) {
- HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
- LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
- }
- }
}
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
@@ -2758,7 +2878,7 @@ SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), None,
MachineMemOperand::MOLoad);
}
@@ -2771,8 +2891,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDLoc DL(CP);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue ConstPool = DAG.getTargetConstantPool(
+ C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
+ return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
+ }
setUsesTOCBasePtr(DAG);
- SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
@@ -2781,15 +2908,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
- SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
- PPCII::MO_PIC_FLAG);
+ SDValue GA =
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
SDValue CPIHi =
- DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
SDValue CPILo =
- DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}
@@ -2846,6 +2973,16 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ // isUsingPCRelativeCalls() returns true when PCRelative is enabled
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDLoc DL(JT);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue GA =
+ DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);
+ SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ return MatAddr;
+ }
+
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
@@ -2875,6 +3012,16 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
const BlockAddress *BA = BASDN->getBlockAddress();
+ // isUsingPCRelativeCalls() returns true when PCRelative is enabled
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDLoc DL(BASDN);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
+ PPCII::MO_PCREL_FLAG);
+ SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ return MatAddr;
+ }
+
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual BlockAddress is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
@@ -3004,6 +3151,22 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
// 64-bit SVR4 ABI & AIX ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ if (isAccessedAsGotIndirect(Op)) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
+ PPCII::MO_PCREL_FLAG |
+ PPCII::MO_GOT_FLAG);
+ SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
+ MachinePointerInfo());
+ return Load;
+ } else {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
+ PPCII::MO_PCREL_FLAG);
+ return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ }
+ }
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
return getTOCEntry(DAG, DL, GA);
@@ -3025,13 +3188,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue GALo =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
- SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
-
- // If the global reference is actually to a non-lazy-pointer, we have to do an
- // extra load to get the address of the global.
- if (MOHiFlag & PPCII::MO_NLP_FLAG)
- Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
- return Ptr;
+ return LowerLabelRef(GAHi, GALo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -3192,10 +3349,10 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
// We have to copy the entire va_list struct:
// 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
- return DAG.getMemcpy(Op.getOperand(0), Op,
- Op.getOperand(1), Op.getOperand(2),
- DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
- false, MachinePointerInfo(), MachinePointerInfo());
+ return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
+ false, true, false, MachinePointerInfo(),
+ MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
@@ -3252,7 +3409,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
- if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+ if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
@@ -3358,31 +3515,31 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
- ISD::ArgFlagsTy Flags,
- unsigned PtrByteSize) {
- unsigned Align = PtrByteSize;
+static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+ ISD::ArgFlagsTy Flags,
+ unsigned PtrByteSize) {
+ Align Alignment(PtrByteSize);
// Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
- Align = 16;
+ Alignment = Align(16);
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
- Align = 32;
+ Alignment = Align(32);
// ByVal parameters are aligned as requested.
if (Flags.isByVal()) {
- unsigned BVAlign = Flags.getByValAlign();
+ auto BVAlign = Flags.getNonZeroByValAlign();
if (BVAlign > PtrByteSize) {
- if (BVAlign % PtrByteSize != 0)
- llvm_unreachable(
+ if (BVAlign.value() % PtrByteSize != 0)
+ llvm_unreachable(
"ByVal alignment is not a multiple of the pointer size");
- Align = BVAlign;
+ Alignment = BVAlign;
}
}
@@ -3392,12 +3549,12 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
// needs to be aligned to the size of the full type. (Except for
// ppcf128, which is only aligned as its f64 components.)
if (Flags.isSplit() && OrigVT != MVT::ppcf128)
- Align = OrigVT.getStoreSize();
+ Alignment = Align(OrigVT.getStoreSize());
else
- Align = ArgVT.getStoreSize();
+ Alignment = Align(ArgVT.getStoreSize());
}
- return Align;
+ return Alignment;
}
/// CalculateStackSlotUsed - Return whether this argument will use its
@@ -3415,9 +3572,9 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
bool UseMemory = false;
// Respect alignment of argument on the stack.
- unsigned Align =
- CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
- ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ Align Alignment =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = alignTo(ArgOffset, Alignment);
// If there's no space left in the argument save area, we must
// use memory (this check also catches zero-sized arguments).
if (ArgOffset >= LinkageSize + ParamAreaSize)
@@ -3461,10 +3618,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
unsigned NumBytes) {
- unsigned TargetAlign = Lowering->getStackAlignment();
- unsigned AlignMask = TargetAlign - 1;
- NumBytes = (NumBytes + AlignMask) & ~AlignMask;
- return NumBytes;
+ return alignTo(NumBytes, Lowering->getStackAlign());
}
SDValue PPCTargetLowering::LowerFormalArguments(
@@ -3527,7 +3681,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
- unsigned PtrByteSize = 4;
+ const Align PtrAlign(4);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -3536,7 +3690,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
- CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+ CCInfo.AllocateStack(LinkageSize, PtrAlign);
if (useSoftFloat())
CCInfo.PreAnalyzeFormalArguments(Ins);
@@ -3645,7 +3799,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
- CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+ CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
@@ -3692,7 +3846,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
CCInfo.getNextStackOffset(), true));
- FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
+ FuncInfo->setVarArgsFrameIndex(
+ MFI.CreateStackObject(Depth, Align(8), false));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// The fixed integer arguments of a variadic function are stored to the
@@ -3839,11 +3994,13 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
- unsigned CurArgOffset, Align;
+ unsigned CurArgOffset;
+ Align Alignment;
auto ComputeArgOffset = [&]() {
/* Respect alignment of argument on the stack. */
- Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
- ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ Alignment =
+ CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = alignTo(ArgOffset, Alignment);
CurArgOffset = ArgOffset;
};
@@ -3891,7 +4048,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
else
- FI = MFI.CreateStackObject(ArgSize, Align, false);
+ FI = MFI.CreateStackObject(ArgSize, Alignment, false);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Handle aggregates smaller than 8 bytes.
@@ -4139,7 +4296,11 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
- if (isVarArg) {
+ // On ELFv2ABI spec, it writes:
+ // C programs that are intended to be *portable* across different compilers
+ // and architectures must use the header file <stdarg.h> to deal with variable
+ // argument lists.
+ if (isVarArg && MFI.hasVAStart()) {
int Depth = ArgOffset;
FuncInfo->setVarArgsFrameIndex(
@@ -4547,30 +4708,67 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
static bool isFunctionGlobalAddress(SDValue Callee);
-static bool
-callsShareTOCBase(const Function *Caller, SDValue Callee,
- const TargetMachine &TM) {
- // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
- // don't have enough information to determine if the caller and calle share
- // the same TOC base, so we have to pessimistically assume they don't for
- // correctness.
- GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (!G)
- return false;
-
- const GlobalValue *GV = G->getGlobal();
+static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
+ const TargetMachine &TM) {
+ // It does not make sense to call callsShareTOCBase() with a caller that
+ // is PC Relative since PC Relative callers do not have a TOC.
+#ifndef NDEBUG
+ const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
+ assert(!STICaller->isUsingPCRelativeCalls() &&
+ "PC Relative callers do not have a TOC and cannot share a TOC Base");
+#endif
+
+ // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
+ // don't have enough information to determine if the caller and callee share
+ // the same TOC base, so we have to pessimistically assume they don't for
+ // correctness.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G)
+ return false;
+
+ const GlobalValue *GV = G->getGlobal();
+
+ // If the callee is preemptable, then the static linker will use a plt-stub
+ // which saves the toc to the stack, and needs a nop after the call
+ // instruction to convert to a toc-restore.
+ if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
+ return false;
+
+ // Functions with PC Relative enabled may clobber the TOC in the same DSO.
+ // We may need a TOC restore in the situation where the caller requires a
+ // valid TOC but the callee is PC Relative and does not.
+ const Function *F = dyn_cast<Function>(GV);
+ const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
+
+ // If we have an Alias we can try to get the function from there.
+ if (Alias) {
+ const GlobalObject *GlobalObj = Alias->getBaseObject();
+ F = dyn_cast<Function>(GlobalObj);
+ }
+
+ // If we still have no valid function pointer we do not have enough
+ // information to determine if the callee uses PC Relative calls so we must
+ // assume that it does.
+ if (!F)
+ return false;
+
+ // If the callee uses PC Relative we cannot guarantee that the callee won't
+ // clobber the TOC of the caller and so we must assume that the two
+ // functions do not share a TOC base.
+ const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);
+ if (STICallee->isUsingPCRelativeCalls())
+ return false;
+
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
- // single TOC. Since each module will be addressed with a single TOC then we
- // only need to check that caller and callee don't cross dso boundaries.
+ // single TOC.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
- return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV);
+ return true;
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
-
if (!GV->isStrongDefinitionForLinker())
return false;
@@ -4585,26 +4783,6 @@ callsShareTOCBase(const Function *Caller, SDValue Callee,
return false;
}
- // If the callee might be interposed, then we can't assume the ultimate call
- // target will be in the same section. Even in cases where we can assume that
- // interposition won't happen, in any case where the linker might insert a
- // stub to allow for interposition, we must generate code as though
- // interposition might occur. To understand why this matters, consider a
- // situation where: a -> b -> c where the arrows indicate calls. b and c are
- // in the same section, but a is in a different module (i.e. has a different
- // TOC base pointer). If the linker allows for interposition between b and c,
- // then it will generate a stub for the call edge between b and c which will
- // save the TOC pointer into the designated stack slot allocated by b. If we
- // return true here, and therefore allow a tail call between b and c, that
- // stack slot won't exist and the b -> c stub will end up saving b'c TOC base
- // pointer into the stack slot allocated by a (where the a -> b stub saved
- // a's TOC base pointer). If we're not considering a tail call, but rather,
- // whether a nop is needed after the call instruction in b, because the linker
- // will insert a stub, it might complain about a missing nop if we omit it
- // (although many don't complain in this case).
- if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
- return false;
-
return true;
}
@@ -4646,13 +4824,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
return false;
}
-static bool
-hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
- if (CS.arg_size() != CallerFn->arg_size())
+static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
+ if (CB.arg_size() != CallerFn->arg_size())
return false;
- ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
- ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
+ auto CalleeArgIter = CB.arg_begin();
+ auto CalleeArgEnd = CB.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
@@ -4694,15 +4871,10 @@ areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
-bool
-PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
- SDValue Callee,
- CallingConv::ID CalleeCC,
- ImmutableCallSite CS,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const {
+bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+ SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
if (DisableSCO && !TailCallOpt) return false;
@@ -4744,15 +4916,22 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
needStackSlotPassParameters(Subtarget, Outs))
return false;
- // No TCO/SCO on indirect call because Caller have to restore its TOC
- if (!isFunctionGlobalAddress(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee))
+ // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
+ // the caller and callee share the same TOC for TCO/SCO. If the caller and
+ // callee potentially have different TOC bases then we cannot tail call since
+ // we need to restore the TOC pointer after the call.
+ // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+ // We cannot guarantee this for indirect calls or calls to external functions.
+ // When PC-Relative addressing is used, the concept of the TOC is no longer
+ // applicable so this check is not required.
+ // Check first for indirect calls.
+ if (!Subtarget.isUsingPCRelativeCalls() &&
+ !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee))
return false;
- // If the caller and callee potentially have different TOC bases then we
- // cannot tail call since we need to restore the TOC pointer after the call.
- // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
- if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
+ // Check if we share the TOC base.
+ if (!Subtarget.isUsingPCRelativeCalls() &&
+ !callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
@@ -4764,10 +4943,14 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// If callee use the same argument list that caller is using, then we can
// apply SCO on this case. If it is not, then we need to check if callee needs
// stack for passing arguments.
- if (!hasSameArgumentList(&Caller, CS) &&
- needStackSlotPassParameters(Subtarget, Outs)) {
+ // PC Relative tail calls may not have a CallBase.
+ // If there is no CallBase we cannot verify if we have the same argument
+ // list so assume that we don't have the same argument list.
+ if (CB && !hasSameArgumentList(&Caller, *CB) &&
+ needStackSlotPassParameters(Subtarget, Outs))
+ return false;
+ else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
return false;
- }
return true;
}
@@ -4876,18 +5059,6 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(MF, NewRetAddr));
-
- // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
- // slot as the FP is never overwritten.
- if (Subtarget.isDarwinABI()) {
- int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
- int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
- true);
- SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
- Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), NewFPIdx));
- }
}
return Chain;
}
@@ -4922,14 +5093,6 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
Chain = SDValue(LROpOut.getNode(), 1);
-
- // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
- // slot as the FP is never overwritten.
- if (Subtarget.isDarwinABI()) {
- FPOpOut = getFramePointerFrameIndex(DAG);
- FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
- Chain = SDValue(FPOpOut.getNode(), 1);
- }
}
return Chain;
}
@@ -4944,9 +5107,9 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
- return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
- false, false, false, MachinePointerInfo(),
- MachinePointerInfo());
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
+ Flags.getNonZeroByValAlign(), false, false, false,
+ MachinePointerInfo(), MachinePointerInfo());
}
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
@@ -5097,28 +5260,37 @@ static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
return true;
}
-static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
- bool isTailCall, const Function &Caller,
+// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
+static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
+ return Subtarget.isAIXABI() ||
+ (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
+}
+
+static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
+ const Function &Caller,
const SDValue &Callee,
const PPCSubtarget &Subtarget,
const TargetMachine &TM) {
- if (isTailCall)
+ if (CFlags.IsTailCall)
return PPCISD::TC_RETURN;
// This is a call through a function pointer.
- if (isIndirectCall) {
+ if (CFlags.IsIndirect) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
// indirect calls. The save of the caller's TOC pointer to the stack will be
// inserted into the DAG as part of call lowering. The restore of the TOC
// pointer is modeled by using a pseudo instruction for the call opcode that
// represents the 2 instruction sequence of an indirect branch and link,
// immediately followed by a load of the TOC pointer from the stack save
- // slot into gpr2.
- if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
- return PPCISD::BCTRL_LOAD_TOC;
+ // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
+ // as it is not saved or used.
+ return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
+ : PPCISD::BCTRL;
+ }
- // An indirect call that does not need a TOC restore.
- return PPCISD::BCTRL;
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
+ return PPCISD::CALL_NOTOC;
}
// The ABIs that maintain a TOC pointer across calls need to have a nop
@@ -5136,14 +5308,6 @@ static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
return PPCISD::CALL;
}
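
A hedged source-level illustration of the indirect-call case discussed above: on AIX, and on 64-bit ELF without PC-relative addressing, a call through a function pointer is selected as BCTRL_LOAD_TOC so the caller's TOC pointer is reloaded from its stack save slot after the branch-and-link.

// Hypothetical indirect call that needs the TOC save/restore sequence.
using Callback = int (*)(int);
int invoke(Callback F, int X) {
  return F(X); // bctrl, then a load of the TOC pointer back into gpr2
}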
-static bool isValidAIXExternalSymSDNode(StringRef SymName) {
- return StringSwitch<bool>(SymName)
- .Cases("__divdi3", "__fixunsdfdi", "__floatundidf", "__floatundisf",
- "__moddi3", "__udivdi3", "__umoddi3", true)
- .Cases("ceil", "floor", "memcpy", "memmove", "memset", "round", true)
- .Default(false);
-}
-
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const SDLoc &dl, const PPCSubtarget &Subtarget) {
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
@@ -5179,14 +5343,14 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
- if (IsDeclaration && !S->hasContainingCsect()) {
+ if (IsDeclaration && !S->hasRepresentedCsectSet()) {
// On AIX, an undefined symbol needs to be associated with a
// MCSectionXCOFF to get the correct storage mapping class.
// In this case, XCOFF::XMC_PR.
MCSectionXCOFF *Sec = Context.getXCOFFSection(
- S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
+ S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
SectionKind::getMetadata());
- S->setContainingCsect(Sec);
+ S->setRepresentedCsect(Sec);
}
MVT PtrVT =
@@ -5227,12 +5391,7 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
SC);
}
- // TODO: Remove this when the support for ExternalSymbolSDNode is complete.
- if (isValidAIXExternalSymSDNode(SymName)) {
- return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
- }
-
- report_fatal_error("Unexpected ExternalSymbolSDNode: " + Twine(SymName));
+ return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
}
// No transformation needed.
@@ -5270,7 +5429,7 @@ static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
SDValue CallSeqStart,
- ImmutableCallSite CS, const SDLoc &dl,
+ const CallBase *CB, const SDLoc &dl,
bool hasNest,
const PPCSubtarget &Subtarget) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
@@ -5306,7 +5465,7 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
MachineMemOperand::MOInvariant)
: MachineMemOperand::MONone;
- MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
+ MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
// Registers used in building the DAG.
const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
@@ -5360,12 +5519,12 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
}
static void
-buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
- const SDLoc &dl, bool isTailCall, bool isVarArg,
- bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+buildCallOperands(SmallVectorImpl<SDValue> &Ops,
+ PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
+ SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
- const PPCSubtarget &Subtarget, bool isIndirect) {
+ const PPCSubtarget &Subtarget) {
const bool IsPPC64 = Subtarget.isPPC64();
// MVT for a general purpose register.
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
@@ -5374,10 +5533,10 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
Ops.push_back(Chain);
// If it's a direct call pass the callee as the second operand.
- if (!isIndirect)
+ if (!CFlags.IsIndirect)
Ops.push_back(Callee);
else {
- assert(!isPatchPoint && "Patch point call are not indirect.");
+ assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
// For the TOC based ABIs, we have saved the TOC pointer to the linkage area
// on the stack (this would have been done in `LowerCall_64SVR4` or
@@ -5386,7 +5545,9 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
// pointer from the linkage area. The operand for the TOC restore is an add
// of the TOC save offset to the stack pointer. This must be the second
// operand: after the chain input but before any other variadic arguments.
- if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+ // For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
+ // saved or used.
+ if (isTOCSaveRestoreRequired(Subtarget)) {
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
@@ -5397,18 +5558,18 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
}
// Add the register used for the environment pointer.
- if (Subtarget.usesFunctionDescriptors() && !hasNest)
+ if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
RegVT));
// Add CTR register as callee so a bctr can be emitted later.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
}
// If this is a tail call add stack pointer delta.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
@@ -5420,17 +5581,18 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
// no way to mark dependencies as implicit here.
// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
- if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint)
+ if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
+ !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
- if (isVarArg && Subtarget.is32BitELFABI())
+ if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
- TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -5440,44 +5602,47 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
}
SDValue PPCTargetLowering::FinishCall(
- CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
- bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+ CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
+ SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {
- if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI())
+ if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
+ Subtarget.isAIXABI())
setUsesTOCBasePtr(DAG);
- const bool isIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint);
- unsigned CallOpc = getCallOpcode(isIndirect, isPatchPoint, isTailCall,
- DAG.getMachineFunction().getFunction(),
- Callee, Subtarget, DAG.getTarget());
+ unsigned CallOpc =
+ getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
+ Subtarget, DAG.getTarget());
- if (!isIndirect)
+ if (!CFlags.IsIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);
else if (Subtarget.usesFunctionDescriptors())
- prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CS,
- dl, hasNest, Subtarget);
+ prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
+ dl, CFlags.HasNest, Subtarget);
else
prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
// Build the operand list for the call instruction.
SmallVector<SDValue, 8> Ops;
- buildCallOperands(Ops, CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- hasNest, DAG, RegsToPass, Glue, Chain, Callee, SPDiff,
- Subtarget, isIndirect);
+ buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
+ SPDiff, Subtarget);
// Emit tail call.
- if (isTailCall) {
+ if (CFlags.IsTailCall) {
+ // Indirect tail calls when using PC Relative calls do not have the same
+ // constraints.
assert(((Callee.getOpcode() == ISD::Register &&
cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
Callee.getOpcode() == ISD::TargetExternalSymbol ||
Callee.getOpcode() == ISD::TargetGlobalAddress ||
- isa<ConstantSDNode>(Callee)) &&
- "Expecting a global address, external symbol, absolute value or "
- "register");
+ isa<ConstantSDNode>(Callee) ||
+ (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
+ "Expecting a global address, external symbol, absolute value, "
+ "register or an indirect tail call when PC Relative calls are "
+ "used.");
+ // PC Relative calls also use TC_RETURN as the way to mark tail calls.
assert(CallOpc == PPCISD::TC_RETURN &&
"Unexpected call opcode for a tail call.");
DAG.getMachineFunction().getFrameInfo().setHasTailCall();
@@ -5486,12 +5651,13 @@ SDValue PPCTargetLowering::FinishCall(
std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
Glue = Chain.getValue(1);
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCFrameLowering::eliminateCallFramePseudoInstr.
- int BytesCalleePops = (CallConv == CallingConv::Fast &&
+ int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
getTargetMachine().Options.GuaranteedTailCallOpt)
? NumBytes
: 0;
@@ -5501,7 +5667,8 @@ SDValue PPCTargetLowering::FinishCall(
Glue, dl);
Glue = Chain.getValue(1);
- return LowerCallResult(Chain, Glue, CallConv, isVarArg, Ins, dl, DAG, InVals);
+ return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,
+ DAG, InVals);
}
SDValue
@@ -5518,15 +5685,14 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
- ImmutableCallSite CS = CLI.CS;
+ const CallBase *CB = CLI.CB;
if (isTailCall) {
- if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
+ if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
- isTailCall =
- IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
- isVarArg, Outs, Ins, DAG);
+ isTailCall = IsEligibleForTailCallOptimization_64SVR4(
+ Callee, CallConv, CB, isVarArg, Outs, Ins, DAG);
else
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
@@ -5535,21 +5701,23 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
++NumSiblingCalls;
- assert(isa<GlobalAddressSDNode>(Callee) &&
+ // PC Relative calls no longer guarantee that the callee is a Global
+ // Address Node. The callee could be an indirect tail call in which
+ // case the SDValue for the callee could be a load (to load the address
+ // of a function pointer) or it may be a register copy (to move the
+ // address of the callee from a function parameter into a virtual
+ // register). It may also be an ExternalSymbolSDNode (e.g. memcpy).
+ assert((Subtarget.isUsingPCRelativeCalls() ||
+ isa<GlobalAddressSDNode>(Callee)) &&
"Callee should be an llvm::Function object.");
- LLVM_DEBUG(
- const GlobalValue *GV =
- cast<GlobalAddressSDNode>(Callee)->getGlobal();
- const unsigned Width =
- 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
- dbgs() << "TCO caller: "
- << left_justify(DAG.getMachineFunction().getName(), Width)
- << ", callee linkage: " << GV->getVisibility() << ", "
- << GV->getLinkage() << "\n");
+
+ LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
+ << "\nTCO callee: ");
+ LLVM_DEBUG(Callee.dump());
}
}
- if (!isTailCall && CS && CS.isMustTailCall())
+ if (!isTailCall && CB && CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -5560,42 +5728,49 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
!isTailCall)
Callee = LowerGlobalAddress(Callee, DAG);
+ CallFlags CFlags(
+ CallConv, isTailCall, isVarArg, isPatchPoint,
+ isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
+ // hasNest
+ Subtarget.is64BitELFABI() &&
+ any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
+ CLI.NoMerge);
+
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
- return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
if (Subtarget.isSVR4ABI())
- return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
if (Subtarget.isAIXABI())
- return LowerCall_AIX(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
- return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
}
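
For readers following the refactor, a rough sketch of the flag bundle that now replaces the long boolean parameter lists. This is an assumption pieced together from the fields used in this diff; the authoritative declaration lives with PPCTargetLowering in PPCISelLowering.h and may differ in detail.

// Sketch only -- see PPCISelLowering.h for the real declaration.
#include "llvm/IR/CallingConv.h"
struct CallFlagsSketch {
  llvm::CallingConv::ID CallConv;
  bool IsTailCall, IsVarArg, IsPatchPoint, IsIndirect, HasNest, NoMerge;
};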
SDValue PPCTargetLowering::LowerCall_32SVR4(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
+ const CallingConv::ID CallConv = CFlags.CallConv;
+ const bool IsVarArg = CFlags.IsVarArg;
+ const bool IsTailCall = CFlags.IsTailCall;
+
assert((CallConv == CallingConv::C ||
CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
- unsigned PtrByteSize = 4;
+ const Align PtrAlign(4);
MachineFunction &MF = DAG.getMachineFunction();
@@ -5614,15 +5789,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
- PtrByteSize);
+ PtrAlign);
if (useSoftFloat())
CCInfo.PreAnalyzeCallOperands(Outs);
- if (isVarArg) {
+ if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Fixed vector arguments go into registers as long as registers are
// available. Variable vector arguments always go into memory.
@@ -5657,10 +5832,10 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
- CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());
+ CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
- CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+ CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
@@ -5671,7 +5846,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
- int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -5767,7 +5942,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
assert(VA.isMemLoc());
unsigned LocMemOffset = VA.getLocMemOffset();
- if (!isTailCall) {
+ if (!IsTailCall) {
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
@@ -5796,7 +5971,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Set CR bit 6 to true if this is a vararg call with floating args passed in
// registers.
- if (isVarArg) {
+ if (IsVarArg) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, InFlag };
@@ -5806,14 +5981,12 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
InFlag = Chain.getValue(1);
}
- if (isTailCall)
+ if (IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- /* unused except on PPC64 ELFv1 */ false, DAG,
- RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
- NumBytes, Ins, InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
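
As the CR bit 6 comment above notes, a 32-bit SVR4 vararg call that passes a floating-point value in registers is the case that needs the CR1EQ marker. An illustrative example, assuming nothing beyond the standard library:

// Hypothetical vararg call with a floating-point argument; the lowering above
// records that an FPR was used so printf-style callees can spill it.
#include <cstdio>
int report(double Value) { return std::printf("%f\n", Value); }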
// Copy an argument into memory, being careful to do this outside the
@@ -5834,25 +6007,24 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
}
SDValue PPCTargetLowering::LowerCall_64SVR4(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
- bool hasNest = false;
bool IsSibCall = false;
+ bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
- if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+ if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
IsSibCall = true;
// Mark this function as potentially containing a function that contains a
@@ -5860,11 +6032,10 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// and restoring the caller's stack pointer in this function's epilog. This is
// done because, by tail calling, the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
- if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
- assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+ assert(!(IsFastCall && CFlags.IsVarArg) &&
"fastcc not supported on varargs functions");
// Count how many bytes are to be pushed on the stack, including the linkage
@@ -5894,7 +6065,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// can be passed to the callee in registers.
// For the fast calling convention, there is another check below.
// Note: We should keep consistent with LowerFormalArguments_64SVR4()
- bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
+ bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
if (!HasParameterArea) {
unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned AvailableFPRs = NumFPRs;
@@ -5916,7 +6087,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Avoid allocating parameter area for fastcc functions if all the arguments
// can be passed in the registers.
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
HasParameterArea = false;
// Add up all the space actually used.
@@ -5928,7 +6099,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (Flags.isNest())
continue;
- if (CallConv == CallingConv::Fast) {
+ if (IsFastCall) {
if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
if (NumGPRsUsed > NumGPRs)
@@ -5976,9 +6147,9 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
}
/* Respect alignment of argument on the stack. */
- unsigned Align =
- CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
- NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+ auto Alignment =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ NumBytes = alignTo(NumBytes, Alignment);
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
@@ -6001,8 +6172,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
- if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
int SPDiff = 0;
@@ -6010,11 +6180,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
if (!IsSibCall)
- SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
@@ -6058,16 +6228,16 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// we'll actually use a stack slot.
auto ComputePtrOff = [&]() {
/* Respect alignment of argument on the stack. */
- unsigned Align =
- CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
- ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ auto Alignment =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = alignTo(ArgOffset, Alignment);
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
};
- if (CallConv != CallingConv::Fast) {
+ if (!IsFastCall) {
ComputePtrOff();
/* Compute GPR index associated with argument offset. */
@@ -6098,7 +6268,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (Size == 0)
continue;
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
// All aggregates smaller than 8 bytes must be passed right-justified.
@@ -6203,7 +6373,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
- hasNest = true;
break;
}
@@ -6213,18 +6382,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (GPR_idx != NumGPRs) {
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, false, MemOpChains,
+ true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ArgOffset += PtrByteSize;
}
- if (CallConv != CallingConv::Fast)
+ if (!IsFastCall)
ArgOffset += PtrByteSize;
break;
case MVT::f32:
@@ -6238,7 +6407,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Unnamed arguments for vararg functions always go to GPRs and
// then the parameter save area. For now, put all arguments to vararg
// routines always in both locations (FPR *and* GPR or stack slot).
- bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+ bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
bool NeededLoad = false;
// First load the argument into the next available FPR.
@@ -6248,7 +6417,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Next, load the argument into GPR or stack slot if needed.
if (!NeedGPROrStack)
;
- else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
+ else if (GPR_idx != NumGPRs && !IsFastCall) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
@@ -6292,7 +6461,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (ArgVal.getNode())
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
// Single-precision floating-point values are mapped to the
@@ -6306,7 +6475,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, false, MemOpChains,
+ true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
NeededLoad = true;
@@ -6314,7 +6483,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
- if (CallConv != CallingConv::Fast || NeededLoad) {
+ if (!IsFastCall || NeededLoad) {
ArgOffset += (Arg.getValueType() == MVT::f32 &&
Flags.isInConsecutiveRegs()) ? 4 : 8;
if (Flags.isInConsecutiveRegsLast())
@@ -6339,7 +6508,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// usual; unnamed arguments always go to the stack or the corresponding
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
@@ -6371,19 +6540,19 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (VR_idx != NumVRs) {
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, true, MemOpChains,
+ true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ArgOffset += 16;
}
- if (CallConv != CallingConv::Fast)
+ if (!IsFastCall)
ArgOffset += 16;
break;
} // not QPX
@@ -6395,7 +6564,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v4f64:
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
@@ -6427,19 +6596,19 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (QFPR_idx != NumQFPRs) {
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, true, MemOpChains,
+ true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
}
- if (CallConv != CallingConv::Fast)
+ if (!IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
break;
}
@@ -6456,23 +6625,26 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Check if this is an indirect call (MTCTR/BCTRL).
// See prepareDescriptorIndirectCall and buildCallOperands for more
// information about calls through function pointers in the 64-bit SVR4 ABI.
- if (!isTailCall && !isPatchPoint &&
- !isFunctionGlobalAddress(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee)) {
- // Load r2 into a virtual register and store it to the TOC save area.
- setUsesTOCBasePtr(DAG);
- SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
- // TOC save area offset.
- unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
- SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
- Chain = DAG.getStore(
- Val.getValue(1), dl, Val, AddPtr,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
+ if (CFlags.IsIndirect) {
+ // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
+ // caller in the TOC save area.
+ if (isTOCSaveRestoreRequired(Subtarget)) {
+ assert(!CFlags.IsTailCall && "Indirect tail calls not supported");
+ // Load r2 into a virtual register and store it to the TOC save area.
+ setUsesTOCBasePtr(DAG);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+ // TOC save area offset.
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
+ MachinePointerInfo::getStack(
+ DAG.getMachineFunction(), TOCSaveOffset));
+ }
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
- if (isELFv2ABI && !isPatchPoint)
+ if (isELFv2ABI && !CFlags.IsPatchPoint)
RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
@@ -6485,23 +6657,21 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
InFlag = Chain.getValue(1);
}
- if (isTailCall && !IsSibCall)
+ if (CFlags.IsTailCall && !IsSibCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
- DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
- SPDiff, NumBytes, Ins, InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
SDValue PPCTargetLowering::LowerCall_Darwin(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -6516,7 +6686,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// done because, by tail calling, the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ CFlags.CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
@@ -6539,7 +6709,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
- if (!isVarArg && !isPPC64) {
+ if (!CFlags.IsVarArg && !isPPC64) {
// Non-varargs Altivec parameters go after all the non-Altivec
// parameters; handle those later so we know how much padding we need.
nAltivecParamsAtEnd++;
@@ -6566,16 +6736,16 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ CFlags.CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
- int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
@@ -6711,7 +6881,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, false, MemOpChains,
+ isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
}
ArgOffset += PtrByteSize;
@@ -6721,7 +6891,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
if (FPR_idx != NumFPRs) {
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
@@ -6753,7 +6923,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
}
} else
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, false, MemOpChains,
+ isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
if (isPPC64)
ArgOffset += 8;
@@ -6764,7 +6934,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
// These go aligned on the stack, or in the corresponding R registers
// when within range. The Darwin PPC ABI doc claims they also go in
// V registers; in fact gcc does this only for arguments that are
@@ -6810,7 +6980,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
} else if (nAltivecParamsAtEnd==0) {
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, true, MemOpChains,
+ isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
@@ -6822,7 +6992,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// don't track this here because nobody below needs it.
// If there are more Altivec parameters than fit in registers emit
// the stores here.
- if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+ if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) {
unsigned j = 0;
// Offset is aligned; skip 1st 12 params which go in V registers.
ArgOffset = ((ArgOffset+15)/16)*16;
@@ -6836,7 +7006,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
SDValue PtrOff;
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, true, MemOpChains,
+ isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
@@ -6850,12 +7020,11 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// On Darwin, R12 must contain the address of an indirect callee. This does
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
- if (!isTailCall &&
- !isFunctionGlobalAddress(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee) &&
- !isBLACompatibleAddress(Callee, DAG))
+ if (CFlags.IsIndirect) {
+ assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
PPC::R12), Callee));
+ }
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
@@ -6866,37 +7035,37 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
InFlag = Chain.getValue(1);
}
- if (isTailCall)
+ if (CFlags.IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- /* unused except on PPC64 ELFv1 */ false, DAG,
- RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
- NumBytes, Ins, InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State) {
+ const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
+ State.getMachineFunction().getSubtarget());
+ const bool IsPPC64 = Subtarget.isPPC64();
+ const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
+ const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
+
+ assert((!ValVT.isInteger() ||
+ (ValVT.getSizeInBits() <= RegVT.getSizeInBits())) &&
+ "Integer argument exceeds register size: should have been legalized");
+
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
- if (ArgFlags.isByVal())
- report_fatal_error("Passing structure by value is unimplemented.");
-
if (ArgFlags.isNest())
report_fatal_error("Nest arguments are unimplemented.");
if (ValVT.isVector() || LocVT.isVector())
report_fatal_error("Vector arguments are unimplemented on AIX.");
- const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
- State.getMachineFunction().getSubtarget());
- const bool IsPPC64 = Subtarget.isPPC64();
- const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
-
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
@@ -6904,6 +7073,38 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
+ if (ArgFlags.isByVal()) {
+ if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
+ report_fatal_error("Pass-by-value arguments with alignment greater than "
+ "register width are not supported.");
+
+ const unsigned ByValSize = ArgFlags.getByValSize();
+
+ // An empty aggregate parameter takes up no storage and no registers,
+ // but needs a MemLoc for a stack slot for the formal arguments side.
+ if (ByValSize == 0) {
+ State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
+ State.getNextStackOffset(), RegVT,
+ LocInfo));
+ return false;
+ }
+
+ const unsigned StackSize = alignTo(ByValSize, PtrAlign);
+ unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
+ for (const unsigned E = Offset + StackSize; Offset < E;
+ Offset += PtrAlign.value()) {
+ if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
+ else {
+ State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
+ Offset, MVT::INVALID_SIMPLE_VALUE_TYPE,
+ LocInfo));
+ break;
+ }
+ }
+ return false;
+ }
+
// Arguments always reserve parameter save area.
switch (ValVT.SimpleTy) {
default:
@@ -6913,49 +7114,55 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
assert(IsPPC64 && "PPC32 should have split i64 values.");
LLVM_FALLTHROUGH;
case MVT::i1:
- case MVT::i32:
- State.AllocateStack(PtrByteSize, PtrByteSize);
- if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
- MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
- // Promote integers if needed.
- if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
- LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
- : CCValAssign::LocInfo::ZExt;
+ case MVT::i32: {
+ const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
+ // AIX integer arguments are always promoted to register width.
+ if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
+ LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
+ : CCValAssign::LocInfo::ZExt;
+ if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
- }
else
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
- return false;
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
+ return false;
+ }
case MVT::f32:
case MVT::f64: {
// Parameter save area (PSA) is reserved even if the float passes in fpr.
const unsigned StoreSize = LocVT.getStoreSize();
// Floats are always 4-byte aligned in the PSA on AIX.
// This includes f64 in 64-bit mode for ABI compatibility.
- State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4);
- if (unsigned Reg = State.AllocateReg(FPR))
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- else
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
-
- // AIX requires that GPRs are reserved for float arguments.
- // Successfully reserved GPRs are only initialized for vararg calls.
- MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
- for (unsigned I = 0; I < StoreSize; I += PtrByteSize) {
+ const unsigned Offset =
+ State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4));
+ unsigned FReg = State.AllocateReg(FPR);
+ if (FReg)
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
+
+ // Reserve and initialize GPRs or initialize the PSA as required.
+ for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
+ assert(FReg && "An FPR should be available when a GPR is reserved.");
if (State.isVarArg()) {
+ // Successfully reserved GPRs are only initialized for vararg calls.
// Custom handling is required for:
// f64 in PPC32 needs to be split into 2 GPRs.
// f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
State.addLoc(
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
}
- } else if (State.isVarArg()) {
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
+ } else {
+ // If there are insufficient GPRs, the PSA needs to be initialized.
+ // For compatibility with the AIX XL compiler, initialization occurs even
+ // if an FPR was already assigned. The full memory for the
+ // argument will be initialized even if a prior word is saved in a GPR.
+ // A custom memLoc is used when the argument also passes in an FPR so
+ // that the callee handling can skip over it easily.
+ State.addLoc(
+ FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
+ LocInfo)
+ : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ break;
}
}
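
An illustrative (non-normative) example of why the GPR/parameter-save-area shadow above matters: for a variadic callee the caller initializes the FPR and the corresponding GPR or parameter-save-area words, so the callee may fetch the value from either home.

// Hypothetical variadic call on AIX: 3.0 travels in an FPR and is also
// mirrored into the GPR/parameter-save-area words reserved for it.
double accumulate(int Count, ...);                 // declaration only, for illustration
double demo() { return accumulate(1, 3.0); }       // caller fills FPR and GPR/PSA copy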
@@ -7000,6 +7207,64 @@ static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
}
+static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
+ const unsigned LASize = FL->getLinkageSize();
+
+ if (PPC::GPRCRegClass.contains(Reg)) {
+ assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
+ "Reg must be a valid argument register!");
+ return LASize + 4 * (Reg - PPC::R3);
+ }
+
+ if (PPC::G8RCRegClass.contains(Reg)) {
+ assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
+ "Reg must be a valid argument register!");
+ return LASize + 8 * (Reg - PPC::X3);
+ }
+
+ llvm_unreachable("Only general purpose registers expected.");
+}
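
A standalone restatement of the mapping above, under the linkage-area sizes this file documents elsewhere (24 bytes on PPC32, 48 bytes on PPC64): the Nth argument GPR is homed N pointer-sized slots past the linkage area.

// Sketch only: where argument GPR number GprIdx (0 for R3/X3) is homed in the
// parameter save area.
unsigned gprHomeOffset(unsigned GprIdx, bool Is64Bit) {
  const unsigned LinkageSize = Is64Bit ? 48 : 24;  // assumed AIX linkage sizes
  const unsigned SlotSize = Is64Bit ? 8 : 4;
  return LinkageSize + SlotSize * GprIdx;          // e.g. R5 -> 24 + 4*2 = 32
}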
+
+// AIX ABI Stack Frame Layout:
+//
+// Low Memory +--------------------------------------------+
+// SP +---> | Back chain | ---+
+// | +--------------------------------------------+ |
+// | | Saved Condition Register | |
+// | +--------------------------------------------+ |
+// | | Saved Linkage Register | |
+// | +--------------------------------------------+ | Linkage Area
+// | | Reserved for compilers | |
+// | +--------------------------------------------+ |
+// | | Reserved for binders | |
+// | +--------------------------------------------+ |
+// | | Saved TOC pointer | ---+
+// | +--------------------------------------------+
+// | | Parameter save area |
+// | +--------------------------------------------+
+// | | Alloca space |
+// | +--------------------------------------------+
+// | | Local variable space |
+// | +--------------------------------------------+
+// | | Float/int conversion temporary |
+// | +--------------------------------------------+
+// | | Save area for AltiVec registers |
+// | +--------------------------------------------+
+// | | AltiVec alignment padding |
+// | +--------------------------------------------+
+// | | Save area for VRSAVE register |
+// | +--------------------------------------------+
+// | | Save area for General Purpose registers |
+// | +--------------------------------------------+
+// | | Save area for Floating Point registers |
+// | +--------------------------------------------+
+// +---- | Back chain |
+// High Memory +--------------------------------------------+
+//
+// Specifications:
+// AIX 7.2 Assembler Language Reference
+// Subroutine linkage convention
+
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -7009,9 +7274,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
- if (isVarArg)
- report_fatal_error("This call type is unimplemented on AIX.");
-
if (getTargetMachine().Options.GuaranteedTailCallOpt)
report_fatal_error("Tail call support is unimplemented on AIX.");
@@ -7029,67 +7291,214 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
- // On AIX a minimum of 8 words is saved to the parameter save area.
- const unsigned MinParameterSaveArea = 8 * PtrByteSize;
- CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize);
+ CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- SDValue ArgValue;
- ISD::ArgFlagsTy Flags = Ins[i].Flags;
- if (VA.isRegLoc()) {
- EVT ValVT = VA.getValVT();
- MVT LocVT = VA.getLocVT();
+ SmallVector<SDValue, 8> MemOps;
+
+ for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
+ CCValAssign &VA = ArgLocs[I++];
+ MVT LocVT = VA.getLocVT();
+ ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
+
+ // For compatibility with the AIX XL compiler, the float args in the
+ // parameter save area are initialized even if the argument is available
+ // in a register. The caller is required to initialize both the register
+ // and memory; however, the callee can choose to expect it in either.
+ // The memloc is dismissed here because the argument is retrieved from
+ // the register.
+ if (VA.isMemLoc() && VA.needsCustom())
+ continue;
+
+ if (Flags.isByVal() && VA.isMemLoc()) {
+ const unsigned Size =
+ alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
+ PtrByteSize);
+ const int FI = MF.getFrameInfo().CreateFixedObject(
+ Size, VA.getLocMemOffset(), /* IsImmutable */ false,
+ /* IsAliased */ true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+
+ continue;
+ }
+
+ if (Flags.isByVal()) {
+ assert(VA.isRegLoc() && "MemLocs should already be handled.");
+
+ const MCPhysReg ArgReg = VA.getLocReg();
+ const PPCFrameLowering *FL = Subtarget.getFrameLowering();
+
+ if (Flags.getNonZeroByValAlign() > PtrByteSize)
+ report_fatal_error("Over aligned byvals not supported yet.");
+
+ const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
+ const int FI = MF.getFrameInfo().CreateFixedObject(
+ StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
+ /* IsAliased */ true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+
+ // Add live ins for all the RegLocs for the same ByVal.
+ const TargetRegisterClass *RegClass =
+ IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+
+ auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
+ unsigned Offset) {
+ const unsigned VReg = MF.addLiveIn(PhysReg, RegClass);
+ // Since the caller's side has left-justified the aggregate in the
+ // register, we can simply store the entire register into the stack
+ // slot.
+ SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+ // The store to the fixedstack object is needed because accessing a
+ // field of the ByVal will use a gep and load. Ideally we will optimize
+ // to extract the value from the register directly, and elide the
+ // stores when the argument's address is not taken, but that will need to
+ // be future work.
+ SDValue Store =
+ DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom,
+ DAG.getObjectPtrOffset(dl, FIN, Offset),
+ MachinePointerInfo::getFixedStack(MF, FI, Offset));
+
+ MemOps.push_back(Store);
+ };
+
+ unsigned Offset = 0;
+ HandleRegLoc(VA.getLocReg(), Offset);
+ Offset += PtrByteSize;
+ for (; Offset != StackSize && ArgLocs[I].isRegLoc();
+ Offset += PtrByteSize) {
+ assert(ArgLocs[I].getValNo() == VA.getValNo() &&
+ "RegLocs should be for ByVal argument.");
+
+ const CCValAssign RL = ArgLocs[I++];
+ HandleRegLoc(RL.getLocReg(), Offset);
+ }
+
+ if (Offset != StackSize) {
+ assert(ArgLocs[I].getValNo() == VA.getValNo() &&
+ "Expected MemLoc for remaining bytes.");
+ assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
+ // Consume the MemLoc. The InVal has already been emitted, so nothing
+ // more needs to be done.
+ ++I;
+ }
+
+ continue;
+ }
+
+ EVT ValVT = VA.getValVT();
+ if (VA.isRegLoc() && !VA.needsCustom()) {
MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
unsigned VReg =
MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
- ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
if (ValVT.isScalarInteger() &&
(ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
ArgValue =
truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
}
InVals.push_back(ArgValue);
- } else {
- report_fatal_error("Handling of formal arguments on the stack is "
- "unimplemented!");
+ continue;
+ }
+ if (VA.isMemLoc()) {
+ const unsigned LocSize = LocVT.getStoreSize();
+ const unsigned ValSize = ValVT.getStoreSize();
+ assert((ValSize <= LocSize) &&
+ "Object size is larger than size of MemLoc");
+ int CurArgOffset = VA.getLocMemOffset();
+ // Objects are right-justified because AIX is big-endian.
+ if (LocSize > ValSize)
+ CurArgOffset += LocSize - ValSize;
+ // Potential tail calls could cause overwriting of argument stack slots.
+ const bool IsImmutable =
+ !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue ArgValue =
+ DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
+ InVals.push_back(ArgValue);
+ continue;
}
}
+ // On AIX a minimum of 8 words is saved to the parameter save area.
+ const unsigned MinParameterSaveArea = 8 * PtrByteSize;
// Area that is at least reserved in the caller of this function.
- unsigned MinReservedArea = CCInfo.getNextStackOffset();
+ unsigned CallerReservedArea =
+ std::max(CCInfo.getNextStackOffset(), LinkageSize + MinParameterSaveArea);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so
// that taking the difference between two stack areas will result in an
// aligned stack.
- MinReservedArea =
- EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+ CallerReservedArea =
+ EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- FuncInfo->setMinReservedArea(MinReservedArea);
+ FuncInfo->setMinReservedArea(CallerReservedArea);
+
+ if (isVarArg) {
+ FuncInfo->setVarArgsFrameIndex(
+ MFI.CreateFixedObject(PtrByteSize, CCInfo.getNextStackOffset(), true));
+ SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+ static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10};
+
+ static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10};
+ const unsigned NumGPArgRegs = array_lengthof(IsPPC64 ? GPR_64 : GPR_32);
+
+ // The fixed integer arguments of a variadic function are stored to the
+ // VarArgsFrameIndex on the stack so that they may be loaded by
+ // dereferencing the result of va_next.
+ for (unsigned GPRIndex =
+ (CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize;
+ GPRIndex < NumGPArgRegs; ++GPRIndex) {
+
+ const unsigned VReg =
+ IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
+ : MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ // Increment the address for the next argument to store.
+ SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
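
Illustrative only: a variadic function of the sort the varargs block above now supports. The remaining argument GPRs are spilled next to the incoming stack arguments so the va_list machinery can walk one contiguous area.

// Hypothetical variadic callee built on the lowering above.
#include <cstdarg>
int sumInts(int Count, ...) {
  va_list AP;
  va_start(AP, Count);
  int Total = 0;
  for (int I = 0; I < Count; ++I)
    Total += va_arg(AP, int);  // reads the GPR spill area, then the stack
  va_end(AP);
  return Total;
}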
SDValue PPCTargetLowering::LowerCall_AIX(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
+ // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
+ // AIX ABI stack frame layout.
- assert((CallConv == CallingConv::C ||
- CallConv == CallingConv::Cold ||
- CallConv == CallingConv::Fast) && "Unexpected calling convention!");
+ assert((CFlags.CallConv == CallingConv::C ||
+ CFlags.CallConv == CallingConv::Cold ||
+ CFlags.CallConv == CallingConv::Fast) &&
+ "Unexpected calling convention!");
- if (isPatchPoint)
+ if (CFlags.IsPatchPoint)
report_fatal_error("This call type is unimplemented on AIX.");
const PPCSubtarget& Subtarget =
@@ -7101,7 +7510,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
@@ -7109,8 +7519,9 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
const bool IsPPC64 = Subtarget.isPPC64();
+ const EVT PtrVT = getPointerTy(DAG.getDataLayout());
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
- CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+ CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
// The prolog code of the callee may store up to 8 GPR argument registers to
@@ -7120,7 +7531,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
- const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize;
+ const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize,
+ CCInfo.getNextStackOffset());
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass.
@@ -7128,77 +7540,192 @@ SDValue PPCTargetLowering::LowerCall_AIX(
SDValue CallSeqStart = Chain;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Set up a copy of the stack pointer for loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
+ : DAG.getRegister(PPC::R1, MVT::i32);
for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
- CCValAssign &VA = ArgLocs[I++];
+ const unsigned ValNo = ArgLocs[I].getValNo();
+ SDValue Arg = OutVals[ValNo];
+ ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
- if (VA.isMemLoc())
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
- if (!VA.isRegLoc())
- report_fatal_error(
- "Unexpected non-register location for function call argument.");
+ if (Flags.isByVal()) {
+ const unsigned ByValSize = Flags.getByValSize();
- SDValue Arg = OutVals[VA.getValNo()];
+ // Nothing to do for zero-sized ByVals on the caller side.
+ if (!ByValSize) {
+ ++I;
+ continue;
+ }
- if (!VA.needsCustom()) {
- switch (VA.getLocInfo()) {
- default:
- report_fatal_error("Unexpected argument extension type.");
- case CCValAssign::Full:
- break;
- case CCValAssign::ZExt:
- Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
- break;
- case CCValAssign::SExt:
- Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
- break;
+ auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
+ return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
+ (LoadOffset != 0)
+ ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
+ : Arg,
+ MachinePointerInfo(), VT);
+ };
+
+ unsigned LoadOffset = 0;
+
+      // Initialize registers that are fully occupied by the by-val argument.
+ while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
+ SDValue Load = GetLoad(PtrVT, LoadOffset);
+ MemOpChains.push_back(Load.getValue(1));
+ LoadOffset += PtrByteSize;
+ const CCValAssign &ByValVA = ArgLocs[I++];
+ assert(ByValVA.getValNo() == ValNo &&
+ "Unexpected location for pass-by-value argument.");
+ RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));
}
+
+ if (LoadOffset == ByValSize)
+ continue;
+
+ // There must be one more loc to handle the remainder.
+ assert(ArgLocs[I].getValNo() == ValNo &&
+ "Expected additional location for by-value argument.");
+
+ if (ArgLocs[I].isMemLoc()) {
+ assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
+ const CCValAssign &ByValVA = ArgLocs[I++];
+ ISD::ArgFlagsTy MemcpyFlags = Flags;
+ // Only memcpy the bytes that don't pass in register.
+ MemcpyFlags.setByValSize(ByValSize - LoadOffset);
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(
+ (LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
+ : Arg,
+ DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()),
+ CallSeqStart, MemcpyFlags, DAG, dl);
+ continue;
+ }
+
+ // Initialize the final register residue.
+ // Any residue that occupies the final by-val arg register must be
+ // left-justified on AIX. Loads must be a power-of-2 size and cannot be
+ // larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
+ // 2 and 1 byte loads.
+ const unsigned ResidueBytes = ByValSize % PtrByteSize;
+ assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
+ "Unexpected register residue for by-value argument.");
+ SDValue ResidueVal;
+ for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
+ const unsigned N = PowerOf2Floor(ResidueBytes - Bytes);
+ const MVT VT =
+ N == 1 ? MVT::i8
+ : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
+ SDValue Load = GetLoad(VT, LoadOffset);
+ MemOpChains.push_back(Load.getValue(1));
+ LoadOffset += N;
+ Bytes += N;
+
+      // By-val arguments are passed left-justified in register.
+ // Every load here needs to be shifted, otherwise a full register load
+ // should have been used.
+ assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
+ "Unexpected load emitted during handling of pass-by-value "
+ "argument.");
+ unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
+ EVT ShiftAmountTy =
+ getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
+ SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
+ SDValue ShiftedLoad =
+ DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
+ ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
+ ShiftedLoad)
+ : ShiftedLoad;
+ }
+
+ const CCValAssign &ByValVA = ArgLocs[I++];
+ RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
+ continue;
+ }
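
A self-contained sketch of the residue handling above: the last, partially filled register is assembled from power-of-2 loads, each shifted so the data stays left-justified. The 7-byte payload and the __builtin_clz-based PowerOf2Floor stand-in are illustrative assumptions, not part of the patch.

#include <cstdint>
#include <cstdio>

static uint64_t packByValResidue(const uint8_t *Bytes, unsigned ResidueBytes,
                                 unsigned PtrByteSize /* 8 on PPC64 */) {
  uint64_t Result = 0;
  unsigned Offset = 0;
  while (Offset != ResidueBytes) {
    // PowerOf2Floor stand-in: largest power of 2 not exceeding the remainder.
    unsigned N = 1u << (31 - __builtin_clz(ResidueBytes - Offset));
    uint64_t Load = 0;
    for (unsigned i = 0; i < N; ++i) // zero-extending big-endian load of N bytes
      Load = (Load << 8) | Bytes[Offset + i];
    Offset += N;
    // Left-justify: shift each partial load towards the high end of the GPR.
    Result |= Load << ((PtrByteSize - Offset) * 8);
  }
  return Result;
}

int main() {
  const uint8_t Arg[7] = {1, 2, 3, 4, 5, 6, 7}; // 7-byte by-val residue
  std::printf("%016llx\n", (unsigned long long)packByValResidue(Arg, 7, 8));
  return 0; // prints 0102030405060700: loads of 4, 2 and 1 bytes, left-justified
}
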
+
+ CCValAssign &VA = ArgLocs[I++];
+ const MVT LocVT = VA.getLocVT();
+ const MVT ValVT = VA.getValVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ report_fatal_error("Unexpected argument extension type.");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc() && !VA.needsCustom()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+
+ if (VA.isMemLoc()) {
+ SDValue PtrOff =
+ DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
continue;
}
// Custom handling is used for GPR initializations for vararg float
// arguments.
- assert(isVarArg && VA.getValVT().isFloatingPoint() &&
- VA.getLocVT().isInteger() &&
- "Unexpected custom register handling for calling convention.");
+ assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
+ ValVT.isFloatingPoint() && LocVT.isInteger() &&
+ "Unexpected register handling for calling convention.");
SDValue ArgAsInt =
- DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg);
+ DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
- if (Arg.getValueType().getStoreSize() == VA.getLocVT().getStoreSize())
+ if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
// f32 in 32-bit GPR
// f64 in 64-bit GPR
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
- else if (Arg.getValueType().getSizeInBits() < VA.getLocVT().getSizeInBits())
+ else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits())
// f32 in 64-bit GPR.
RegsToPass.push_back(std::make_pair(
- VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, VA.getLocVT())));
+ VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
else {
// f64 in two 32-bit GPRs
// The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
- assert(Arg.getValueType() == MVT::f64 && isVarArg && !IsPPC64 &&
+ assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
"Unexpected custom register for argument!");
CCValAssign &GPR1 = VA;
SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
DAG.getConstant(32, dl, MVT::i8));
RegsToPass.push_back(std::make_pair(
GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
- assert(I != E && "A second custom GPR is expected!");
- CCValAssign &GPR2 = ArgLocs[I++];
- assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() &&
- GPR2.needsCustom() && "A second custom GPR is expected!");
- RegsToPass.push_back(std::make_pair(
- GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
+
+ if (I != E) {
+ // If only 1 GPR was available, there will only be one custom GPR and
+ // the argument will also pass in memory.
+ CCValAssign &PeekArg = ArgLocs[I];
+      if (PeekArg.isRegLoc() && PeekArg.getValNo() == ValNo) {
+ assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
+ CCValAssign &GPR2 = ArgLocs[I++];
+ RegsToPass.push_back(std::make_pair(
+ GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
+ }
+ }
}
}
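
For the f64-in-two-GPRs vararg path above, the value is bitcast to i64, the high word goes to the first GPR and the low word to the second one when it is available. A small standalone sketch with an illustrative constant:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  double D = 1.5; // illustrative value
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits)); // the DAG bitcast to an integer type
  uint32_t MSW = uint32_t(Bits >> 32);  // first GPR: SRL by 32, then truncate
  uint32_t LSW = uint32_t(Bits);        // second GPR, if one was allocated
  std::printf("%08x %08x\n", MSW, LSW); // prints 3ff80000 00000000
  return 0;
}
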
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
// For indirect calls, we need to save the TOC base to the stack for
// restoration after the call.
- if (!isTailCall && !isPatchPoint &&
- !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) {
+ if (CFlags.IsIndirect) {
+ assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
@@ -7224,10 +7751,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
}
const int SPDiff = 0;
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass,
- InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins,
- InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
bool
@@ -7299,25 +7824,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
- const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
- const MCPhysReg *I =
- TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
- if (I) {
- for (; *I; ++I) {
-
- if (PPC::G8RCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::i64));
- else if (PPC::F8RCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
- else if (PPC::CRRCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::i1));
- else if (PPC::VRRCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::Other));
- else
- llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- }
- }
-
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
@@ -7419,6 +7925,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
@@ -7431,9 +7938,10 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
DAG.getConstant(0, dl, PtrVT), Size);
// Construct a node for the frame pointer save index.
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
- // Build a DYNALLOC node.
SDValue Ops[3] = { Chain, NegSize, FPSIdx };
SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+ if (hasInlineStackProbe(MF))
+ return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops);
return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}
@@ -7582,15 +8090,6 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
!Op.getOperand(2).getValueType().isFloatingPoint())
return Op;
- bool HasNoInfs = DAG.getTarget().Options.NoInfsFPMath;
- bool HasNoNaNs = DAG.getTarget().Options.NoNaNsFPMath;
- // We might be able to do better than this under some circumstances, but in
- // general, fsel-based lowering of select is a finite-math-only optimization.
- // For more information, see section F.3 of the 2.06 ISA specification.
- // With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the
- // presence of infinities.
- if (!Subtarget.hasP9Vector() && (!HasNoInfs || !HasNoNaNs))
- return Op;
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
EVT ResVT = Op.getValueType();
@@ -7598,14 +8097,14 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
SDLoc dl(Op);
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+  // With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the
+  // presence of infinities.
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
switch (CC) {
default:
- // Not a min/max but with finite math, we may still be able to use fsel.
- if (HasNoInfs && HasNoNaNs)
- break;
- return Op;
+ break;
case ISD::SETOGT:
case ISD::SETGT:
return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS);
@@ -7615,10 +8114,13 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // TODO: Propagate flags from the select rather than global settings.
- SDNodeFlags Flags;
- Flags.setNoInfs(true);
- Flags.setNoNaNs(true);
+ // We might be able to do better than this under some circumstances, but in
+ // general, fsel-based lowering of select is a finite-math-only optimization.
+ // For more information, see section F.3 of the 2.06 ISA specification.
+  // With ISA 3.0, the xsmaxcdp/xsmincdp min/max cases were already handled
+  // above.
+ if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
+ (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()))
+ return Op;
// If the RHS of the comparison is a 0.0, we don't need to do the
// subtraction at all.
@@ -7738,12 +8240,12 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
// Emit a store to the stack slot.
SDValue Chain;
- unsigned Alignment = DAG.getEVTAlignment(Tmp.getValueType());
+ Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
if (i32Stack) {
MachineFunction &MF = DAG.getMachineFunction();
- Alignment = 4;
+ Alignment = Align(4);
MachineMemOperand *MMO =
- MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
+ MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
@@ -7803,7 +8305,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
// FP to INT conversions are legal for f128.
- if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128))
+ if (Op->getOperand(0).getValueType() == MVT::f128)
return Op;
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
@@ -7899,7 +8401,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
RLI.MPI = LD->getPointerInfo();
RLI.IsDereferenceable = LD->isDereferenceable();
RLI.IsInvariant = LD->isInvariant();
- RLI.Alignment = LD->getAlignment();
+ RLI.Alignment = LD->getAlign();
RLI.AAInfo = LD->getAAInfo();
RLI.Ranges = LD->getRanges();
@@ -8043,16 +8545,19 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
SDValue ShuffleSrc2 =
SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
- unsigned ExtendOp =
- SignedConv ? (unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST;
SDValue Extend;
- if (!Subtarget.hasP9Altivec() && SignedConv) {
+ if (SignedConv) {
Arrange = DAG.getBitcast(IntermediateVT, Arrange);
+ EVT ExtVT = Op.getOperand(0).getValueType();
+ if (Subtarget.hasP9Altivec())
+ ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
+ IntermediateVT.getVectorNumElements());
+
Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
- DAG.getValueType(Op.getOperand(0).getValueType()));
+ DAG.getValueType(ExtVT));
} else
- Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange);
+ Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);
return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}
@@ -8068,7 +8573,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
return LowerINT_TO_FPVector(Op, DAG, dl);
// Conversions to f128 are legal.
- if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
+ if (Op.getValueType() == MVT::f128)
return Op;
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
@@ -8163,8 +8668,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SINT, DAG.getConstant(53, dl, MVT::i32));
Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
Cond, DAG.getConstant(1, dl, MVT::i64));
- Cond = DAG.getSetCC(dl, MVT::i32,
- Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
+ Cond = DAG.getSetCC(
+ dl,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
}
@@ -8205,7 +8712,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- int FrameIdx = MFI.CreateStackObject(4, 4, false);
+ int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
@@ -8220,7 +8727,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- RLI.Alignment = 4;
+ RLI.Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
@@ -8257,7 +8764,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
bool ReusingLoad;
if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
DAG))) {
- int FrameIdx = MFI.CreateStackObject(4, 4, false);
+ int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
@@ -8272,7 +8779,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- RLI.Alignment = 4;
+ RLI.Alignment = Align(4);
}
MachineMemOperand *MMO =
@@ -8289,7 +8796,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
- int FrameIdx = MFI.CreateStackObject(8, 8, false);
+ int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
@@ -8341,22 +8848,20 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Save FP Control Word to register
- EVT NodeTys[] = {
- MVT::f64, // return register
- MVT::Glue // unused in this context
- };
- SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
+ SDValue Chain = Op.getOperand(0);
+ SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
+ Chain = MFFS.getValue(1);
// Save FP register to stack slot
- int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
- MachinePointerInfo());
+ Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());
// Load FP Control Word from low 32 bits of stack slot.
SDValue Four = DAG.getConstant(4, dl, PtrVT);
SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
- SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());
+ SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
+ Chain = CWD.getValue(1);
// Transform as necessary
SDValue CWD1 =
@@ -8373,8 +8878,11 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue RetVal =
DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
- return DAG.getNode((VT.getSizeInBits() < 16 ?
- ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+ RetVal =
+ DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
+ dl, VT, RetVal);
+
+ return DAG.getMergeValues({RetVal, Chain}, dl);
}
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
@@ -8468,19 +8976,21 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
// Vector related lowering.
//
-/// BuildSplatI - Build a canonical splati of Val with an element size of
-/// SplatSize. Cast the result to VT.
-static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
- SelectionDAG &DAG, const SDLoc &dl) {
+/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
+/// element size of SplatSize. Cast the result to VT.
+static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
+ SelectionDAG &DAG, const SDLoc &dl) {
static const MVT VTys[] = { // canonical VT to use for each size.
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
};
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
- // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
- if (Val == -1)
+ // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
+ if (Val == ((1LU << (SplatSize * 8)) - 1)) {
SplatSize = 1;
+ Val = 0xFF;
+ }
EVT CanonicalVT = VTys[SplatSize-1];
@@ -8591,10 +9101,9 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Op0 = Op->getOperand(0);
- if (!EnableQuadPrecision ||
- (Op.getValueType() != MVT::f128 ) ||
+ if ((Op.getValueType() != MVT::f128) ||
(Op0.getOpcode() != ISD::BUILD_PAIR) ||
- (Op0.getOperand(0).getValueType() != MVT::i64) ||
+ (Op0.getOperand(0).getValueType() != MVT::i64) ||
(Op0.getOperand(1).getValueType() != MVT::i64))
return SDValue();
@@ -8606,7 +9115,8 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) {
const SDValue *InputLoad = &Op;
if (InputLoad->getOpcode() == ISD::BITCAST)
InputLoad = &InputLoad->getOperand(0);
- if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
+ if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
+ InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
InputLoad = &InputLoad->getOperand(0);
if (InputLoad->getOpcode() != ISD::LOAD)
return nullptr;
@@ -8614,6 +9124,34 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) {
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
}
+// Convert the argument APFloat to a single precision APFloat if there is no
+// loss in information during the conversion to single precision APFloat and the
+// resulting number is not a denormal number. Return true if successful.
+bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
+ APFloat APFloatToConvert = ArgAPFloat;
+ bool LosesInfo = true;
+ APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+ bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
+ if (Success)
+ ArgAPFloat = APFloatToConvert;
+ return Success;
+}
+
+// Bitcast the argument APInt to a double and convert it to a single precision
+// APFloat, bitcast the APFloat to an APInt and assign it to the original
+// argument if there is no loss in information during the conversion from
+// double to single precision APFloat and the resulting number is not a denormal
+// number. Return true if successful.
+bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
+ double DpValue = ArgAPInt.bitsToDouble();
+ APFloat APFloatDp(DpValue);
+ bool Success = convertToNonDenormSingle(APFloatDp);
+ if (Success)
+ ArgAPInt = APFloatDp.bitcastToAPInt();
+ return Success;
+}
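
A rough standalone equivalent of the check convertToNonDenormSingle performs, using plain C++ casts instead of APFloat. This is only an approximation: it assumes the default round-to-nearest environment and ignores the NaN payload handling that APFloat provides.

#include <cmath>
#include <cstdio>
#include <limits>

static bool fitsNonDenormalSingle(double D) {
  float F = static_cast<float>(D);              // convert to single precision
  bool LosesInfo = static_cast<double>(F) != D; // round-trip changed the value
  bool IsDenormal =
      F != 0.0f && std::fabs(F) < std::numeric_limits<float>::min();
  return !LosesInfo && !IsDenormal;
}

int main() {
  std::printf("%d %d\n", fitsNonDenormalSingle(1.0), fitsNonDenormalSingle(0.1));
  return 0; // prints "1 0": 1.0 is an XXSPLTIDP candidate, 0.1 is not
}
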
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -8630,7 +9168,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// then convert it to a floating-point vector and compare it
// to a zero vector to get the boolean result.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -8665,8 +9203,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
Constant *CP = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
- 16 /* alignment */);
+ SDValue CPIdx =
+ DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16));
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
@@ -8733,9 +9271,23 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
- HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
- SplatBitSize > 32) {
+ bool BVNIsConstantSplat =
+ BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+
+ // If it is a splat of a double, check if we can shrink it to a 32 bit
+ // non-denormal float which when converted back to double gives us the same
+ // double. This is to exploit the XXSPLTIDP instruction.
+ if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
+ (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
+ convertToNonDenormSingle(APSplatBits)) {
+ SDValue SplatNode = DAG.getNode(
+ PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
+ DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
+ return DAG.getBitcast(Op.getValueType(), SplatNode);
+ }
+
+ if (!BVNIsConstantSplat || SplatBitSize > 32) {
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
// Handle load-and-splat patterns as we have instructions that will do this
@@ -8774,8 +9326,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return SDValue();
}
- unsigned SplatBits = APSplatBits.getZExtValue();
- unsigned SplatUndef = APSplatUndef.getZExtValue();
+ uint64_t SplatBits = APSplatBits.getZExtValue();
+ uint64_t SplatUndef = APSplatUndef.getZExtValue();
unsigned SplatSize = SplatBitSize / 8;
// First, handle single instruction cases.
@@ -8790,17 +9342,30 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return Op;
}
- // We have XXSPLTIB for constant splats one byte wide
- // FIXME: SplatBits is an unsigned int being cast to an int while passing it
- // as an argument to BuildSplatiI. Given SplatSize == 1 it is okay here.
+ // We have XXSPLTIW for constant splats four bytes wide.
+  // Since the vector length is a multiple of 4, 2-byte splats can be replaced
+  // with 4-byte splats. For a 2-byte splat, we replicate the SplatBits to make
+  // a 4-byte splat element. For example: a 2-byte splat of 0xABAB can be
+ // turned into a 4-byte splat of 0xABABABAB.
+ if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
+    return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,
+ Op.getValueType(), DAG, dl);
+
+ if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
+ return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
+ dl);
+
+ // We have XXSPLTIB for constant splats one byte wide.
if (Subtarget.hasP9Vector() && SplatSize == 1)
- return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl);
+ return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
+ dl);
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
(32-SplatBitSize));
if (SextVal >= -16 && SextVal <= 15)
- return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+ return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
+ dl);
// Two instruction sequences.
@@ -8831,7 +9396,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// for fneg/fabs.
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
// Make -1 and vspltisw -1:
- SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+ SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);
// Make the VSLW intrinsic, computing 0x8000_0000.
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
@@ -8859,7 +9424,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + shl self.
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
Intrinsic::ppc_altivec_vslw
@@ -8870,7 +9435,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + srl self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
Intrinsic::ppc_altivec_vsrw
@@ -8881,7 +9446,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + sra self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
Intrinsic::ppc_altivec_vsraw
@@ -8893,7 +9458,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + rol self.
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
Intrinsic::ppc_altivec_vrlw
@@ -8904,19 +9469,19 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
- SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
- SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
- SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
@@ -9215,6 +9780,107 @@ SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
+/// return the default SDValue.
+SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) const {
+ // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
+ // to v16i8. Peek through the bitcasts to get the actual operands.
+ SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
+ SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
+
+ auto ShuffleMask = SVN->getMask();
+ SDValue VecShuffle(SVN, 0);
+ SDLoc DL(SVN);
+
+ // Check that we have a four byte shuffle.
+ if (!isNByteElemShuffleMask(SVN, 4, 1))
+ return SDValue();
+
+ // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
+ if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
+ std::swap(LHS, RHS);
+ VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
+ ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
+ }
+
+ // Ensure that the RHS is a vector of constants.
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+ if (!BVN)
+ return SDValue();
+
+ // Check if RHS is a splat of 4-bytes (or smaller).
+ APInt APSplatValue, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
+ SplatBitSize > 32)
+ return SDValue();
+
+ // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
+ // The instruction splats a constant C into two words of the source vector
+ // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
+ // Thus we check that the shuffle mask is the equivalent of
+ // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
+ // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
+ // within each word are consecutive, so we only need to check the first byte.
+ SDValue Index;
+ bool IsLE = Subtarget.isLittleEndian();
+ if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
+ (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
+ ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
+ Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
+ else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
+ (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
+ ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
+ Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
+ else
+ return SDValue();
+
+ // If the splat is narrower than 32-bits, we need to get the 32-bit value
+ // for XXSPLTI32DX.
+ unsigned SplatVal = APSplatValue.getZExtValue();
+ for (; SplatBitSize < 32; SplatBitSize <<= 1)
+ SplatVal |= (SplatVal << SplatBitSize);
+
+ SDValue SplatNode = DAG.getNode(
+ PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
+ Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
+}
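
The loop near the end of lowerToXXSPLTI32DX widens a sub-32-bit splat constant by replicating it until it fills 32 bits. A standalone sketch with illustrative inputs:

#include <cstdint>
#include <cstdio>

static uint32_t widenSplatTo32(uint32_t SplatVal, unsigned SplatBitSize) {
  for (; SplatBitSize < 32; SplatBitSize <<= 1)
    SplatVal |= SplatVal << SplatBitSize;
  return SplatVal;
}

int main() {
  std::printf("%08x\n", widenSplatTo32(0xAB, 8));    // prints abababab
  std::printf("%08x\n", widenSplatTo32(0xBEEF, 16)); // prints beefbeef
  return 0;
}
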
+
+/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
+/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
+/// a multiple of 8. Otherwise convert it to a scalar rotation (i128),
+/// i.e. (or (shl x, C1), (srl x, 128-C1)).
+SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
+ assert(Op.getValueType() == MVT::v1i128 &&
+ "Only set v1i128 as custom, other type shouldn't reach here!");
+ SDLoc dl(Op);
+ SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
+ SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
+ unsigned SHLAmt = N1.getConstantOperandVal(0);
+ if (SHLAmt % 8 == 0) {
+ SmallVector<int, 16> Mask(16, 0);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
+ if (SDValue Shuffle =
+ DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
+ DAG.getUNDEF(MVT::v16i8), Mask))
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
+ }
+ SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
+ SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
+ DAG.getConstant(SHLAmt, dl, MVT::i32));
+ SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
+ DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
+ SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
+}
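
The shuffle mask LowerROTL builds for a byte-aligned rotate is simply an iota rotated by SHLAmt / 8. A standalone sketch for an illustrative 24-bit rotate amount:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const unsigned SHLAmt = 24; // rotate left by 24 bits, i.e. 3 bytes
  std::vector<int> Mask(16);
  std::iota(Mask.begin(), Mask.end(), 0);
  std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
  for (int M : Mask)
    std::printf("%d ", M); // prints 3 4 5 ... 15 0 1 2
  std::printf("\n");
  return 0;
}
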
+
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
@@ -9225,6 +9891,18 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Any nodes that were combined in the target-independent combiner prior
+ // to vector legalization will not be sent to the target combine. Try to
+ // combine it here.
+ if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
+ if (!isa<ShuffleVectorSDNode>(NewShuffle))
+ return NewShuffle;
+ Op = NewShuffle;
+ SVOp = cast<ShuffleVectorSDNode>(Op);
+ V1 = Op.getOperand(0);
+ V2 = Op.getOperand(1);
+ }
EVT VT = Op.getValueType();
bool isLittleEndian = Subtarget.isLittleEndian();
@@ -9250,6 +9928,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+
+ // If we are loading a partial vector, it does not make sense to adjust
+ // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
+ if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
+ Offset = 0;
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
@@ -9288,6 +9971,12 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+ if (Subtarget.hasPrefixInstrs()) {
+ SDValue SplatInsertNode;
+ if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
+ return SplatInsertNode;
+ }
+
if (Subtarget.hasP9Altivec()) {
SDValue NewISDNode;
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
@@ -9523,7 +10212,13 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
MVT::i32));
}
+ ShufflesHandledWithVPERM++;
SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
+ LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n");
+ LLVM_DEBUG(SVOp->dump());
+ LLVM_DEBUG(dbgs() << "With the following permute control vector:\n");
+ LLVM_DEBUG(VPermMask.dump());
+
if (isLittleEndian)
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V2, V1, VPermMask);
@@ -9880,18 +10575,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
}
-SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
- // Check for a DIV with the same operands as this REM.
- for (auto UI : Op.getOperand(1)->uses()) {
- if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
- (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
- if (UI->getOperand(0) == Op.getOperand(0) &&
- UI->getOperand(1) == Op.getOperand(1))
- return SDValue();
- }
- return Op;
-}
-
// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -9950,7 +10633,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -10020,7 +10703,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -10161,9 +10844,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SDValue Stores[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
- SDValue Ex = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
- DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
+ DAG.getVectorIdxConstant(Idx, dl));
SDValue Store;
if (ScalarVT != ScalarMemVT)
Store =
@@ -10220,7 +10902,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -10269,9 +10951,9 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::v4i32) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
- SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
- SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
-
+ SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
+ // +16 as shift amt.
+ SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
SDValue RHSSwap = // = vrlw RHS, 16
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
@@ -10291,13 +10973,6 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
Neg16, DAG, dl);
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
- } else if (Op.getValueType() == MVT::v8i16) {
- SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
-
- SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
-
- return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
- LHS, RHS, Zero, DAG, dl);
} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
bool isLittleEndian = Subtarget.isLittleEndian();
@@ -10504,6 +11179,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::ROTL: return LowerROTL(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
@@ -10516,9 +11192,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_VOID:
return LowerINTRINSIC_VOID(Op, DAG);
- case ISD::SREM:
- case ISD::UREM:
- return LowerREM(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP:
@@ -10537,8 +11210,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
- Results.push_back(RTB);
- Results.push_back(RTB.getValue(1));
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));
Results.push_back(RTB.getValue(2));
break;
}
@@ -11198,13 +11871,192 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
return MBB;
}
+bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+ return false;
+}
+
+unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ unsigned StackAlign = TFI->getStackAlignment();
+ assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+ "Unexpected stack alignment");
+ // The default stack probe size is 4096 if the function has no
+ // stack-probe-size attribute.
+ unsigned StackProbeSize = 4096;
+ const Function &Fn = MF.getFunction();
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ // Round down to the stack alignment.
+ StackProbeSize &= ~(StackAlign - 1);
+ return StackProbeSize ? StackProbeSize : StackAlign;
+}
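
A standalone sketch of the rounding done in getStackProbeSize, with illustrative attribute values:

#include <cstdio>

static unsigned roundProbeSize(unsigned StackProbeSize, unsigned StackAlign) {
  StackProbeSize &= ~(StackAlign - 1); // round down to a multiple of StackAlign
  return StackProbeSize ? StackProbeSize : StackAlign;
}

int main() {
  std::printf("%u\n", roundProbeSize(4096, 16)); // 4096, already aligned
  std::printf("%u\n", roundProbeSize(1000, 16)); // 992
  std::printf("%u\n", roundProbeSize(8, 16));    // 16, falls back to alignment
  return 0;
}
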
+
+// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split
+// into three phases. In the first phase, it uses the pseudo instruction
+// PREPARE_PROBED_ALLOCA to get the future result of the actual FramePointer
+// and FinalStackPtr. In the second phase, it generates a loop for probing
+// blocks. Finally, it uses the pseudo instruction DYNAREAOFFSET to get the
+// future result of MaxCallFrameSize so that it can calculate the correct data
+// area pointer.
+MachineBasicBlock *
+PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const bool isPPC64 = Subtarget.isPPC64();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const unsigned ProbeSize = getStackProbeSize(*MF);
+ const BasicBlock *ProbedBB = MBB->getBasicBlock();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ // The CFG of probing stack looks as
+ // +-----+
+ // | MBB |
+ // +--+--+
+ // |
+ // +----v----+
+ // +--->+ TestMBB +---+
+ // | +----+----+ |
+ // | | |
+ // | +-----v----+ |
+ // +---+ BlockMBB | |
+ // +----------+ |
+ // |
+ // +---------+ |
+ // | TailMBB +<--+
+ // +---------+
+  // In MBB, calculate the previous frame pointer and the final stack pointer.
+  // In TestMBB, test if the sp is equal to the final stack pointer; if so,
+  // jump to TailMBB. In BlockMBB, update the sp atomically and jump back to
+  // TestMBB. TailMBB is spliced via \p MI.
+ MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
+ MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
+ MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MF->insert(MBBIter, TestMBB);
+ MF->insert(MBBIter, BlockMBB);
+ MF->insert(MBBIter, TailMBB);
+
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register NegSizeReg = MI.getOperand(1).getReg();
+ Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+
+ // Get the canonical FinalStackPtr like what
+ // PPCRegisterInfo::lowerDynamicAlloc does.
+ BuildMI(*MBB, {MI}, DL,
+ TII->get(isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64
+ : PPC::PREPARE_PROBED_ALLOCA_32),
+ FramePointer)
+ .addDef(FinalStackPtr)
+ .addReg(NegSizeReg)
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+
+ // Materialize a scratch register for update.
+ int64_t NegProbeSize = -(int64_t)ProbeSize;
+ assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
+ Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ if (!isInt<16>(NegProbeSize)) {
+ Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
+ .addImm(NegProbeSize >> 16);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
+ ScratchReg)
+ .addReg(TempReg)
+ .addImm(NegProbeSize & 0xFFFF);
+ } else
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
+ .addImm(NegProbeSize);
+
+ {
+ // Probing leading residual part.
+ Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
+ .addReg(NegSizeReg)
+ .addReg(ScratchReg);
+ Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
+ .addReg(Div)
+ .addReg(ScratchReg);
+ Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
+ .addReg(Mul)
+ .addReg(NegSizeReg);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
+ .addReg(FramePointer)
+ .addReg(SPReg)
+ .addReg(NegMod);
+ }
+
+ {
+ // Remaining part should be multiple of ProbeSize.
+ Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
+ BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
+ .addReg(SPReg)
+ .addReg(FinalStackPtr);
+ BuildMI(TestMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_EQ)
+ .addReg(CmpResult)
+ .addMBB(TailMBB);
+ TestMBB->addSuccessor(BlockMBB);
+ TestMBB->addSuccessor(TailMBB);
+ }
+
+ {
+ // Touch the block.
+ // |P...|P...|P...
+ BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
+ .addReg(FramePointer)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
+ BlockMBB->addSuccessor(TestMBB);
+ }
+
+  // Calculation of MaxCallFrameSize is deferred to the prolog/epilog pass, so
+  // use the DYNAREAOFFSET pseudo instruction to get the future result.
+ Register MaxCallFrameSizeReg =
+ MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(TailMBB, DL,
+ TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
+ MaxCallFrameSizeReg)
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg)
+ .addReg(SPReg)
+ .addReg(MaxCallFrameSizeReg);
+
+ // Splice instructions after MI to TailMBB.
+ TailMBB->splice(TailMBB->end(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(TestMBB);
+
+ // Delete the pseudo instruction.
+ MI.eraseFromParent();
+
+ ++NumDynamicAllocaProbed;
+ return TailMBB;
+}
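
A C-level sketch of the behaviour the three phases implement. This illustrates only the probing order (leading residual first, then whole ProbeSize blocks until the stack pointer reaches its final value); it is not the code the custom inserter emits.

#include <cstdint>

static void probedAllocaSketch(volatile char *&SP, uint64_t Size,
                               uint64_t ProbeSize) {
  uint64_t Residual = Size % ProbeSize; // probed by the leading stdux/stwux
  if (Residual) {
    SP -= Residual;
    *SP = 0; // touch the page
  }
  for (uint64_t Done = Residual; Done < Size; Done += ProbeSize) {
    SP -= ProbeSize; // BlockMBB: decrement-and-store in one instruction
    *SP = 0;
  } // TestMBB leaves the loop once SP equals FinalStackPtr
}

int main() {
  alignas(16) static char Stack[65536];
  volatile char *SP = Stack + sizeof(Stack);
  probedAllocaSketch(SP, 10000, 4096); // 3 touches: 1808 + 4096 + 4096 bytes
  return 0;
}
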
+
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
if (Subtarget.is64BitELFABI() &&
- MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ MI.getOpcode() == TargetOpcode::PATCHPOINT &&
+ !Subtarget.isUsingPCRelativeCalls()) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't however, because there is no
// way to mark the dependence as implicit there, and so the stackmap code
@@ -11886,12 +12738,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
MachineFrameInfo &MFI = F->getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(8, 8, false);
+ int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
MachineMemOperand *MMOStore = F->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
- MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlign(FrameIdx));
// Store the SrcReg into the stack.
BuildMI(*BB, MI, dl, TII->get(StoreOp))
@@ -11901,9 +12753,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addMemOperand(MMOStore);
MachineMemOperand *MMOLoad = F->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
- MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlign(FrameIdx));
// Load from the stack where SrcReg is stored, and save to DestReg,
// so we have done the RegClass conversion from RegClass::SrcReg to
@@ -11963,6 +12815,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(NewFPSCRReg)
.addImm(0)
.addImm(0);
+ } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
+ MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
+ return emitProbedAlloca(MI, BB);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
@@ -13167,15 +14022,20 @@ static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
DAG.getVectorShuffle(Input.getValueType(), dl, Input,
DAG.getUNDEF(Input.getValueType()), ShuffleMask);
- EVT Ty = N->getValueType(0);
- SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
- return BV;
+ EVT VT = N->getValueType(0);
+ SDValue Conv = DAG.getBitcast(VT, Shuffle);
+
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ Input.getValueType().getVectorElementType(),
+ VT.getVectorNumElements());
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,
+ DAG.getValueType(ExtVT));
}
// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct indices
-// aren't used, add a vector shuffle to fix up the indices and create a new
-// PPCISD:SExtVElems node which selects the vector sign extend instructions
+// aren't used, add a vector shuffle to fix up the indices and create
+// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
// during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
// This array encodes the indices that the vector sign extend instructions
@@ -13498,8 +14358,8 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
// Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
// aligned and the type is a vector with elements up to 4 bytes
- if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
- && VecTy.getScalarSizeInBits() <= 32 ) {
+ if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
+ VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
@@ -13569,8 +14429,8 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
// Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is
// aligned and the type is a vector with elements up to 4 bytes
- if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
- && VecTy.getScalarSizeInBits() <= 32 ) {
+ if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
+ VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
@@ -13650,6 +14510,210 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
return Val;
}
+static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
+ // Check that the source of the element keeps flipping
+  // (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
+ bool PrevElemFromFirstVec = Mask[0] < NumElts;
+ for (int i = 1, e = Mask.size(); i < e; i++) {
+ if (PrevElemFromFirstVec && Mask[i] < NumElts)
+ return false;
+ if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
+ return false;
+ PrevElemFromFirstVec = !PrevElemFromFirstVec;
+ }
+ return true;
+}
+
+static bool isSplatBV(SDValue Op) {
+ if (Op.getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+ SDValue FirstOp;
+
+ // Find first non-undef input.
+ for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
+ FirstOp = Op.getOperand(i);
+ if (!FirstOp.isUndef())
+ break;
+ }
+
+ // All inputs are undef or the same as the first non-undef input.
+ for (int i = 1, e = Op.getNumOperands(); i < e; i++)
+ if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
+ return false;
+ return true;
+}
+
+static SDValue isScalarToVec(SDValue Op) {
+ if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return Op;
+ if (Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+ if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return Op;
+ return SDValue();
+}
+
+static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
+ int LHSMaxIdx, int RHSMinIdx,
+ int RHSMaxIdx, int HalfVec) {
+ for (int i = 0, e = ShuffV.size(); i < e; i++) {
+ int Idx = ShuffV[i];
+ if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
+ ShuffV[i] += HalfVec;
+ }
+ return;
+}
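
A small demonstration of the fixup above for a v4i32 shuffle whose LHS has been replaced by a permuted scalar_to_vector. LHSMaxIdx, HalfVec and the mask values are illustrative; indices below LHSMaxIdx move by HalfVec so they point at the scalar in its permuted position.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> ShuffV = {0, 4, 5, 6}; // element 0 comes from the LHS s2v
  const int LHSMaxIdx = 1, RHSMinIdx = -1, RHSMaxIdx = -1, HalfVec = 2;
  for (int &Idx : ShuffV)
    if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
      Idx += HalfVec;
  for (int Idx : ShuffV)
    std::printf("%d ", Idx); // prints 2 4 5 6
  std::printf("\n");
  return 0;
}
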
+
+// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
+// the original is:
+// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
+// In such a case, just change the shuffle mask to extract the element
+// from the permuted index.
+static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
+ SDLoc dl(OrigSToV);
+ EVT VT = OrigSToV.getValueType();
+ assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ "Expecting a SCALAR_TO_VECTOR here");
+ SDValue Input = OrigSToV.getOperand(0);
+
+ if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));
+ SDValue OrigVector = Input.getOperand(0);
+
+ // Can't handle non-const element indices or different vector types
+ // for the input to the extract and the output of the scalar_to_vector.
+ if (Idx && VT == OrigVector.getValueType()) {
+ SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1);
+ NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue();
+ return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
+ }
+ }
+ return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT,
+ OrigSToV.getOperand(0));
+}
+
+// On little endian subtargets, combine shuffles such as:
+// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
+// into:
+// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
+// because the latter can be matched to a single instruction merge.
+// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
+// to put the value into element zero. Adjust the shuffle mask so that the
+// vector can remain in permuted form (to prevent a swap prior to a shuffle).
+SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) const {
+ SDValue LHS = SVN->getOperand(0);
+ SDValue RHS = SVN->getOperand(1);
+ auto Mask = SVN->getMask();
+ int NumElts = LHS.getValueType().getVectorNumElements();
+ SDValue Res(SVN, 0);
+ SDLoc dl(SVN);
+
+ // None of these combines are useful on big endian systems since the ISA
+ // already has a big endian bias.
+ if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX())
+ return Res;
+
+ // If this is not a shuffle of a shuffle and the first element comes from
+ // the second vector, canonicalize to the commuted form. This will make it
+ // more likely to match one of the single instruction patterns.
+ if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
+ std::swap(LHS, RHS);
+ Res = DAG.getCommutedVectorShuffle(*SVN);
+ Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
+ }
+
+ // Adjust the shuffle mask if either input vector comes from a
+ // SCALAR_TO_VECTOR and keep the respective input vector in permuted
+ // form (to prevent the need for a swap).
+ SmallVector<int, 16> ShuffV(Mask.begin(), Mask.end());
+ SDValue SToVLHS = isScalarToVec(LHS);
+ SDValue SToVRHS = isScalarToVec(RHS);
+ if (SToVLHS || SToVRHS) {
+ int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
+ : SToVRHS.getValueType().getVectorNumElements();
+ int NumEltsOut = ShuffV.size();
+
+ // Initially assume that neither input is permuted. These will be adjusted
+ // accordingly if either input is.
+ int LHSMaxIdx = -1;
+ int RHSMinIdx = -1;
+ int RHSMaxIdx = -1;
+ int HalfVec = LHS.getValueType().getVectorNumElements() / 2;
+
+ // Get the permuted scalar to vector nodes for the source(s) that come from
+ // ISD::SCALAR_TO_VECTOR.
+ if (SToVLHS) {
+ // Set up the values for the shuffle vector fixup.
+ LHSMaxIdx = NumEltsOut / NumEltsIn;
+ SToVLHS = getSToVPermuted(SToVLHS, DAG);
+ if (SToVLHS.getValueType() != LHS.getValueType())
+ SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
+ LHS = SToVLHS;
+ }
+ if (SToVRHS) {
+ RHSMinIdx = NumEltsOut;
+ RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
+ SToVRHS = getSToVPermuted(SToVRHS, DAG);
+ if (SToVRHS.getValueType() != RHS.getValueType())
+ SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
+ RHS = SToVRHS;
+ }
+
+ // Fix up the shuffle mask to reflect where the desired element actually is.
+ // The minimum and maximum indices that correspond to element zero for both
+ // the LHS and RHS are computed and will control which shuffle mask entries
+ // are to be changed. For example, if the RHS is permuted, any shuffle mask
+ // entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by
+ // HalfVec to refer to the corresponding element in the permuted vector.
+ fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
+ HalfVec);
+ Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
+
+ // We may have simplified away the shuffle. We won't be able to do anything
+ // further with it here.
+ if (!isa<ShuffleVectorSDNode>(Res))
+ return Res;
+ Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
+ }
+
+ // The common case after we commuted the shuffle is that the RHS is a splat
+ // and we have elements coming in from the splat at indices that are not
+ // conducive to using a merge.
+ // Example:
+ // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
+ if (!isSplatBV(RHS))
+ return Res;
+
+ // We are looking for a mask such that all even elements are from
+ // one vector and all odd elements from the other.
+ if (!isAlternatingShuffMask(Mask, NumElts))
+ return Res;
+
+ // Adjust the mask so we are pulling in the same index from the splat
+ // as the index from the interesting vector in consecutive elements.
+ // Example (even elements from first vector):
+ // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
+ if (Mask[0] < NumElts)
+ for (int i = 1, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i - 1] + NumElts);
+ // Example (odd elements from first vector):
+ // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
+ else
+ for (int i = 0, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i + 1] + NumElts);
+
+ // If the RHS has undefs, we need to remove them since we may have created
+ // a shuffle that adds those instead of the splat value.
+ SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue();
+ RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal);
+
+ Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
+ return Res;
+}
+
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const {
@@ -13721,6 +14785,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return combineSRL(N, DCI);
case ISD::MUL:
return combineMUL(N, DCI);
+ case ISD::FMA:
+ case PPCISD::FNMSUB:
+ return combineFMALike(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -13756,7 +14823,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
}
- break;
+ return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);
case ISD::STORE: {
EVT Op1VT = N->getOperand(1).getValueType();
@@ -13963,17 +15030,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
+ Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
- unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
+ Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
- !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
- VT == MVT::v4i32 || VT == MVT::v4f32)) ||
+ !Subtarget.hasP8Vector() &&
+ (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v4f32)) ||
(Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
- LD->getAlignment() >= ScalarABIAlignment)) &&
- LD->getAlignment() < ABIAlignment) {
+ LD->getAlign() >= ScalarABIAlignment)) &&
+ LD->getAlign() < ABIAlignment) {
// This is a type-legal unaligned Altivec or QPX load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
@@ -14520,6 +15588,7 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
+ case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE: {
if (!ML)
break;
@@ -14926,18 +15995,16 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
bool isPPC64 = Subtarget.isPPC64();
- bool IsDarwinABI = Subtarget.isDarwinABI();
bool is64Bit = isPPC64 && VT == LLT::scalar(64);
if (!is64Bit && VT != LLT::scalar(32))
report_fatal_error("Invalid register global variable type");
Register Reg = StringSwitch<Register>(RegName)
- .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
- .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2)
- .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() :
- (is64Bit ? PPC::X13 : PPC::R13))
- .Default(Register());
+ .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+ .Case("r2", isPPC64 ? Register() : PPC::R2)
+ .Case("r13", (is64Bit ? PPC::X13 : PPC::R13))
+ .Default(Register());
if (Reg)
return Reg;
@@ -15030,7 +16097,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -15064,7 +16131,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.size = VT.getStoreSize();
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -15116,7 +16183,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
}
@@ -15149,7 +16216,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.size = VT.getStoreSize();
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
}
@@ -15160,35 +16227,24 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-/// getOptimalMemOpType - Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. If DstAlign is zero that means it's safe to destination
-/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If 'IsMemset' is
-/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-/// source is constant so it does not need to be loaded.
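+/// Picks the preferred type (possibly a vector type) to use when expanding
+/// the memset/memcpy/memmove described by \p Op.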
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
- if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
- (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+ if (Subtarget.hasQPX() && Op.size() >= 32 &&
+ (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) &&
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
// We should use Altivec/VSX loads and stores when available. For unaligned
// addresses, unaligned VSX loads are only fast starting with the P8.
- if (Subtarget.hasAltivec() && Size >= 16 &&
- (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
- ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+ if (Subtarget.hasAltivec() && Op.size() >= 16 &&
+ (Op.isAligned(Align(16)) ||
+ ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
return MVT::v4i32;
}
@@ -15304,22 +16360,48 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
- VT = VT.getScalarType();
-
- if (!VT.isSimple())
- return false;
+ return isFMAFasterThanFMulAndFAdd(
+ MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
+}
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
- case MVT::f64:
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
+ Type *Ty) const {
+ switch (Ty->getScalarType()->getTypeID()) {
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
return true;
- case MVT::f128:
- return (EnableQuadPrecision && Subtarget.hasP9Vector());
+ case Type::FP128TyID:
+ return Subtarget.hasP9Vector();
default:
- break;
+ return false;
}
+}
- return false;
+// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist.
+// FIXME: add more patterns which are profitable to hoist.
+bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
+ if (I->getOpcode() != Instruction::FMul)
+ return true;
+
+ if (!I->hasOneUse())
+ return true;
+
+ Instruction *User = I->user_back();
+ assert(User && "A single use instruction with no uses.");
+
+ if (User->getOpcode() != Instruction::FSub &&
+ User->getOpcode() != Instruction::FAdd)
+ return true;
+
+ const TargetOptions &Options = getTargetMachine().Options;
+ const Function *F = I->getFunction();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *Ty = User->getOperand(0)->getType();
+
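+ // Keep the FMul next to its single FAdd/FSub user: hoisting is only
+ // profitable when the pair cannot be fused into a single FMA.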
+ return !(
+ isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
}
const MCPhysReg *
@@ -15335,12 +16417,12 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
return ScratchRegs;
}
-unsigned PPCTargetLowering::getExceptionPointerRegister(
+Register PPCTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}
-unsigned PPCTargetLowering::getExceptionSelectorRegister(
+Register PPCTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}
@@ -15371,58 +16453,83 @@ PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
return PPC::createFastISel(FuncInfo, LibInfo);
}
-void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
- if (Subtarget.isDarwinABI()) return;
- if (!Subtarget.isPPC64()) return;
-
- // Update IsSplitCSR in PPCFunctionInfo
- PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
- PFI->setIsSplitCSR(true);
+// 'Inverted' means the FMA opcode after negating one multiplicand.
+// For example, (fma -a b c) = (fnmsub a b c)
+static unsigned invertFMAOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Invalid FMA opcode for PowerPC!");
+ case ISD::FMA:
+ return PPCISD::FNMSUB;
+ case PPCISD::FNMSUB:
+ return ISD::FMA;
+ }
}
-void PPCTargetLowering::insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
- const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
- const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
- if (!IStart)
- return;
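+// Try to produce a negated form of Op (see
+// TargetLowering::getNegatedExpression). PPCISD::FNMSUB is handled specially
+// here; all other opcodes defer to the generic implementation.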
+SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOps, bool OptForSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const {
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
- MachineBasicBlock::iterator MBBI = Entry->begin();
- for (const MCPhysReg *I = IStart; *I; ++I) {
- const TargetRegisterClass *RC = nullptr;
- if (PPC::G8RCRegClass.contains(*I))
- RC = &PPC::G8RCRegClass;
- else if (PPC::F8RCRegClass.contains(*I))
- RC = &PPC::F8RCRegClass;
- else if (PPC::CRRCRegClass.contains(*I))
- RC = &PPC::CRRCRegClass;
- else if (PPC::VRRCRegClass.contains(*I))
- RC = &PPC::VRRCRegClass;
- else
- llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+
+ switch (Opc) {
+ case PPCISD::FNMSUB:
+ // TODO: QPX subtarget is deprecated. No transformation here.
+ if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
+ break;
- Register NewVR = MRI->createVirtualRegister(RC);
- // Create copy from CSR to a virtual register.
- // FIXME: this currently does not emit CFI pseudo-instructions, it works
- // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
- // nounwind. If we want to generalize this later, we may need to emit
- // CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
- Entry->addLiveIn(*I);
- BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
- .addReg(*I);
+ const TargetOptions &Options = getTargetMachine().Options;
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ SDLoc Loc(Op);
- // Insert the copy-back instructions right before the terminator.
- for (auto *Exit : Exits)
- BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::COPY), *I)
- .addReg(NewVR);
+ NegatibleCost N2Cost = NegatibleCost::Expensive;
+ SDValue NegN2 =
+ getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);
+
+ if (!NegN2)
+ return SDValue();
+
+ // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
+ // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
+ // These transformations may change sign of zeroes. For example,
+ // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
+ if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
+ // Try and choose the cheaper one to negate.
+ NegatibleCost N0Cost = NegatibleCost::Expensive;
+ SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,
+ N0Cost, Depth + 1);
+
+ NegatibleCost N1Cost = NegatibleCost::Expensive;
+ SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,
+ N1Cost, Depth + 1);
+
+ if (NegN0 && N0Cost <= N1Cost) {
+ Cost = std::min(N0Cost, N2Cost);
+ return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
+ } else if (NegN1) {
+ Cost = std::min(N1Cost, N2Cost);
+ return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
+ }
+ }
+
+ // (fneg (fnmsub a b c)) => (fma a b (fneg c))
+ if (isOperationLegal(ISD::FMA, VT)) {
+ Cost = N2Cost;
+ return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
+ }
+
+ break;
}
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
+ Cost, Depth);
}
// Override to enable LOAD_STACK_GUARD lowering on Linux.
@@ -15450,6 +16557,13 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return false;
case MVT::f32:
case MVT::f64:
+ if (Subtarget.hasPrefixInstrs()) {
+ // With prefixed instructions, we can materialize anything that can be
+ // represented with a 32-bit immediate, not just positive zero.
+ APFloat APFloatOfImm = Imm;
+ return convertToNonDenormSingle(APFloatOfImm);
+ }
+ LLVM_FALLTHROUGH;
case MVT::ppcf128:
return Imm.isPosZero();
}
@@ -15620,10 +16734,59 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Transform
+// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
+// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
+// Both C1 and C2 must be known constants, and C1+C2 must fit into a 34-bit
+// signed integer.
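+// For example, (add 16, (MAT_PCREL_ADDR GlobalAddr+8)) becomes
+// (MAT_PCREL_ADDR GlobalAddr+24).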
+static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
+ if (!Subtarget.isUsingPCRelativeCalls())
+ return SDValue();
+
+ // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
+ // If we find that node try to cast the Global Address and the Constant.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
+ std::swap(LHS, RHS);
+
+ if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
+ return SDValue();
+
+ // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
+ GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
+ ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(RHS);
+
+ // Check that both casts succeeded.
+ if (!GSDN || !ConstNode)
+ return SDValue();
+
+ int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
+ SDLoc DL(GSDN);
+
+ // The signed int offset needs to fit in 34 bits.
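+ // That is the reach of the 34-bit signed displacement used by prefixed
+ // PC-relative instructions such as paddi.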
+ if (!isInt<34>(NewOffset))
+ return SDValue();
+
+ // The new global address is a copy of the old global address except
+ // that it has the updated Offset.
+ SDValue GA =
+ DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
+ NewOffset, GSDN->getTargetFlags());
+ SDValue MatPCRel =
+ DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
+ return MatPCRel;
+}
+
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
return Value;
+ if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
+ return Value;
+
return SDValue();
}
@@ -15648,6 +16811,24 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
+ // fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b)
+ if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+ return SDValue();
+ SDValue Sub = Op0.getOperand(0);
+ if (Sub.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = Sub.getOperand(0);
+ SDValue SubOp1 = Sub.getOperand(1);
+ if ((SubOp0.getOpcode() == ISD::ZERO_EXTEND) &&
+ (SubOp1.getOpcode() == ISD::ZERO_EXTEND)) {
+ return DCI.DAG.getNode(PPCISD::VABSD, dl, VT, SubOp0.getOperand(0),
+ SubOp1.getOperand(0),
+ DCI.DAG.getTargetConstant(0, dl, MVT::i32));
+ }
+ }
+ }
+
// Looking for a truncate of i128 to i64.
if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
return SDValue();
@@ -15702,6 +16883,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
// vector 7 2 2
return true;
case PPC::DIR_PWR9:
+ case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
// type mul add shl
// scalar 5 2 2
@@ -15763,6 +16945,44 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
}
}
+// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
+// in combiner since we need to check SD flags and other subtarget features.
+SDValue PPCTargetLowering::combineFMALike(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDNodeFlags Flags = N->getFlags();
+ EVT VT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetOptions &Options = getTargetMachine().Options;
+ unsigned Opc = N->getOpcode();
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOps = !DCI.isBeforeLegalizeOps();
+ SDLoc Loc(N);
+
+ // TODO: QPX subtarget is deprecated. No transformation here.
+ if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT))
+ return SDValue();
+
+ // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
+ // since (fnmsub a b c)=-0 while c-ab=+0.
+ if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
+ return SDValue();
+
+ // (fma (fneg a) b c) => (fnmsub a b c)
+ // (fnmsub (fneg a) b c) => (fma a b c)
+ if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
+ return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
+
+ // (fma a (fneg b) c) => (fnmsub a b c)
+ // (fnmsub a (fneg b) c) => (fma a b c)
+ if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
+ return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
+
+ return SDValue();
+}
+
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64bit SysV ABIs.
if (!Subtarget.is64BitELFABI())