author | Dimitry Andric <dim@FreeBSD.org> | 2020-07-31 21:22:58 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-31 21:22:58 +0000 |
commit | 5ffd83dbcc34f10e07f6d3e968ae6365869615f4 (patch) | |
tree | 0e9f5cf729dde39f949698fddef45a34e2bc7f44 /contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | |
parent | 1799696096df87b52968b8996d00c91e0a5de8d9 (diff) | |
parent | cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff) | |
download | src-5ffd83dbcc34f10e07f6d3e968ae6365869615f4.tar.gz src-5ffd83dbcc34f10e07f6d3e968ae6365869615f4.zip |
Merge llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp
master 2e10b7a39b9, the last commit before the llvmorg-12-init tag, from
which release/11.x was branched.
Note that for now, I rolled back all our local changes to make merging
easier, and I will reapply the still-relevant ones after updating to
11.0.0-rc1.
Notes:
svn path=/projects/clang1100-import/; revision=363742
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 2820 |
1 file changed, 2020 insertions(+), 800 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ca1649fae258..ddfbd04e1ebc 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -55,7 +55,6 @@
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -118,14 +117,13 @@
 cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
 
 static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
 cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
 
-static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
-cl::desc("enable quad precision float support on ppc"), cl::Hidden);
-
 static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
 cl::desc("use absolute jump tables on ppc"), cl::Hidden);
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
+STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
+STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
 
 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
 
@@ -260,15 +258,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // PowerPC has no SREM/UREM instructions unless we are on P9
   // On P9 we may use a hardware instruction to compute the remainder.
-  // The instructions are not legalized directly because in the cases where the
-  // result of both the remainder and the division is required it is more
-  // efficient to compute the remainder from the result of the division rather
-  // than use the remainder instruction.
+  // When the result of both the remainder and the division is required it is
+  // more efficient to compute the remainder from the result of the division
+  // rather than use the remainder instruction. The instructions are legalized
+  // directly because the DivRemPairsPass performs the transformation at the IR
+  // level.
   if (Subtarget.isISA3_0()) {
-    setOperationAction(ISD::SREM, MVT::i32, Custom);
-    setOperationAction(ISD::UREM, MVT::i32, Custom);
-    setOperationAction(ISD::SREM, MVT::i64, Custom);
-    setOperationAction(ISD::UREM, MVT::i64, Custom);
+    setOperationAction(ISD::SREM, MVT::i32, Legal);
+    setOperationAction(ISD::UREM, MVT::i32, Legal);
+    setOperationAction(ISD::SREM, MVT::i64, Legal);
+    setOperationAction(ISD::UREM, MVT::i64, Legal);
   } else {
     setOperationAction(ISD::SREM, MVT::i32, Expand);
     setOperationAction(ISD::UREM, MVT::i32, Expand);
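An aside on the comment above: LLVM's DivRemPairs pass rewrites a paired division and remainder so that only the divide instruction is needed. A minimal sketch of the identity it relies on (plain C++, not the pass itself):

```cpp
#include <cassert>
#include <cstdint>

// When both q = a / b and r = a % b are needed, the remainder can be
// recovered from the quotient with one multiply and one subtract,
// avoiding a second (often slow) hardware remainder instruction.
int64_t remainder_from_division(int64_t a, int64_t b) {
  int64_t q = a / b; // the only division emitted
  return a - q * b;  // r = a - (a / b) * b
}

int main() {
  assert(remainder_from_division(17, 5) == 17 % 5);
  assert(remainder_from_division(-17, 5) == -17 % 5); // holds for truncating division
}
```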
@@ -286,6 +285,40 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
 
+  // Handle constrained floating-point operations of scalar.
+  // TODO: Handle SPE specific operation.
+  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+
+  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
+  if (Subtarget.hasVSX())
+    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal);
+
+  if (Subtarget.hasFSQRT()) {
+    setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
+  }
+
+  if (Subtarget.hasFPRND()) {
+    setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);
+
+    setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
+    setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
+  }
+
   // We don't support sin/cos/sqrt/fmod/pow
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -390,6 +423,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   if (Subtarget.hasSPE()) {
     // SPE has built-in conversions
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
@@ -539,9 +575,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
   } else {
     // PowerPC does not have FP_TO_UINT on 32-bit implementations.
-    if (Subtarget.hasSPE())
+    if (Subtarget.hasSPE()) {
+      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
-    else
+    } else
       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
   }
@@ -584,6 +621,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   }
 
   if (Subtarget.hasAltivec()) {
+    for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+      setOperationAction(ISD::SADDSAT, VT, Legal);
+      setOperationAction(ISD::SSUBSAT, VT, Legal);
+      setOperationAction(ISD::UADDSAT, VT, Legal);
+      setOperationAction(ISD::USUBSAT, VT, Legal);
+    }
     // First set operation action for all vector types to expand. Then we
     // will selectively turn on ones that can be effectively codegen'd.
     for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -738,6 +781,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     if (!Subtarget.hasP8Altivec())
       setOperationAction(ISD::ABS, MVT::v2i64, Expand);
 
+    // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
+    setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
     // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
     if (Subtarget.hasAltivec())
       for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
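An aside on the STRICT_* operation actions above: constrained (strict) FP operations must observe the dynamic rounding mode and exception flags, so the compiler may not constant-fold or reorder them. A hedged illustration in standard C++ (`#pragma STDC FENV_ACCESS` support varies by compiler, hence the `volatile` operands as a belt-and-braces measure):

```cpp
#include <cfenv>
#include <cstdio>

#pragma STDC FENV_ACCESS ON

int main() {
  // Under strict FP semantics, 1.0 / 3.0 cannot be folded at compile
  // time: the result depends on the rounding mode set at run time.
  volatile double one = 1.0, three = 3.0;
  std::fesetround(FE_DOWNWARD);
  double lo = one / three; // rounded toward -infinity
  std::fesetround(FE_UPWARD);
  double hi = one / three; // rounded toward +infinity
  std::printf("%d\n", lo < hi); // prints 1: the two results differ
}
```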
@@ -764,7 +809,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     else
       setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 
-    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
     setOperationAction(ISD::MUL, MVT::v16i8, Custom);
 
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
@@ -811,12 +856,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
       setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
       setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
       setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
       setOperationAction(ISD::FROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
 
       setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+      setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
       setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
       setOperationAction(ISD::FROUND, MVT::f32, Legal);
+      setOperationAction(ISD::FRINT, MVT::f32, Legal);
 
       setOperationAction(ISD::MUL, MVT::v2f64, Legal);
       setOperationAction(ISD::FMA, MVT::v2f64, Legal);
@@ -906,6 +955,37 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
       setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 
+      // Handle constrained floating-point operations of vector.
+      // The predictor is `hasVSX` because altivec instruction has
+      // no exception but VSX vector instruction has.
+      setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
+      setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
+
+      setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
+      setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
+
       addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
     }
@@ -925,44 +1005,59 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::SRL, MVT::v1i128, Legal);
       setOperationAction(ISD::SRA, MVT::v1i128, Expand);
 
-      if (EnableQuadPrecision) {
-        addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
-        setOperationAction(ISD::FADD, MVT::f128, Legal);
-        setOperationAction(ISD::FSUB, MVT::f128, Legal);
-        setOperationAction(ISD::FDIV, MVT::f128, Legal);
-        setOperationAction(ISD::FMUL, MVT::f128, Legal);
-        setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
-        // No extending loads to f128 on PPC.
-        for (MVT FPT : MVT::fp_valuetypes())
-          setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
-        setOperationAction(ISD::FMA, MVT::f128, Legal);
-        setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
-        setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
-        setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
-        setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
-        setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
-        setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
-
-        setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
-        setOperationAction(ISD::FRINT, MVT::f128, Legal);
-        setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
-        setOperationAction(ISD::FCEIL, MVT::f128, Legal);
-        setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
-        setOperationAction(ISD::FROUND, MVT::f128, Legal);
-
-        setOperationAction(ISD::SELECT, MVT::f128, Expand);
-        setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
-        setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
-        setTruncStoreAction(MVT::f128, MVT::f64, Expand);
-        setTruncStoreAction(MVT::f128, MVT::f32, Expand);
-        setOperationAction(ISD::BITCAST, MVT::i128, Custom);
-        // No implementation for these ops for PowerPC.
-        setOperationAction(ISD::FSIN , MVT::f128, Expand);
-        setOperationAction(ISD::FCOS , MVT::f128, Expand);
-        setOperationAction(ISD::FPOW, MVT::f128, Expand);
-        setOperationAction(ISD::FPOWI, MVT::f128, Expand);
-        setOperationAction(ISD::FREM, MVT::f128, Expand);
-      }
+      addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+      setOperationAction(ISD::FADD, MVT::f128, Legal);
+      setOperationAction(ISD::FSUB, MVT::f128, Legal);
+      setOperationAction(ISD::FDIV, MVT::f128, Legal);
+      setOperationAction(ISD::FMUL, MVT::f128, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+      // No extending loads to f128 on PPC.
+      for (MVT FPT : MVT::fp_valuetypes())
+        setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+      setOperationAction(ISD::FMA, MVT::f128, Legal);
+      setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
+      setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
+      setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
+      setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
+      setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
+      setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
+
+      setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
+      setOperationAction(ISD::FRINT, MVT::f128, Legal);
+      setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
+      setOperationAction(ISD::FCEIL, MVT::f128, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
+      setOperationAction(ISD::FROUND, MVT::f128, Legal);
+
+      setOperationAction(ISD::SELECT, MVT::f128, Expand);
+      setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
+      setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+      setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+      setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+      // No implementation for these ops for PowerPC.
+      setOperationAction(ISD::FSIN, MVT::f128, Expand);
+      setOperationAction(ISD::FCOS, MVT::f128, Expand);
+      setOperationAction(ISD::FPOW, MVT::f128, Expand);
+      setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+      setOperationAction(ISD::FREM, MVT::f128, Expand);
+
+      // Handle constrained floating-point operations of fp128
+      setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+      setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
+      setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
 
       setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
       setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
       setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
@@ -1135,6 +1230,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
       setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     }
+
+    // TODO: Handle constrained floating-point operations of v4f64
   }
 
   if (Subtarget.has64BitSupport())
@@ -1169,6 +1266,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::BUILD_VECTOR);
   if (Subtarget.hasFPCVT())
@@ -1208,34 +1306,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setTargetDAGCombine(ISD::VSELECT);
   }
 
-  // Darwin long double math library functions have $LDBL128 appended.
-  if (Subtarget.isDarwin()) {
-    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
-    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
-    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
-    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
-    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
-    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
-    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
-    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
-    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
-    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
-  }
-
-  if (EnableQuadPrecision) {
-    setLibcallName(RTLIB::LOG_F128, "logf128");
-    setLibcallName(RTLIB::LOG2_F128, "log2f128");
-    setLibcallName(RTLIB::LOG10_F128, "log10f128");
-    setLibcallName(RTLIB::EXP_F128, "expf128");
-    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
-    setLibcallName(RTLIB::SIN_F128, "sinf128");
-    setLibcallName(RTLIB::COS_F128, "cosf128");
-    setLibcallName(RTLIB::POW_F128, "powf128");
-    setLibcallName(RTLIB::FMIN_F128, "fminf128");
-    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
-    setLibcallName(RTLIB::POWI_F128, "__powikf2");
-    setLibcallName(RTLIB::REM_F128, "fmodf128");
-  }
+  setLibcallName(RTLIB::LOG_F128, "logf128");
+  setLibcallName(RTLIB::LOG2_F128, "log2f128");
+  setLibcallName(RTLIB::LOG10_F128, "log10f128");
+  setLibcallName(RTLIB::EXP_F128, "expf128");
+  setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+  setLibcallName(RTLIB::SIN_F128, "sinf128");
+  setLibcallName(RTLIB::COS_F128, "cosf128");
+  setLibcallName(RTLIB::POW_F128, "powf128");
+  setLibcallName(RTLIB::FMIN_F128, "fminf128");
+  setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+  setLibcallName(RTLIB::POWI_F128, "__powikf2");
+  setLibcallName(RTLIB::REM_F128, "fmodf128");
 
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
@@ -1245,8 +1327,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   }
 
   setMinFunctionAlignment(Align(4));
-  if (Subtarget.isDarwin())
-    setPrefFunctionAlignment(Align(16));
 
   switch (Subtarget.getCPUDirective()) {
   default: break;
@@ -1263,6 +1343,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
+  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE:
     setPrefLoopAlignment(Align(16));
     setPrefFunctionAlignment(Align(16));
@@ -1298,27 +1379,33 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     MaxLoadsPerMemcmp = 8;
     MaxLoadsPerMemcmpOptSize = 4;
   }
+
+  // Let the subtarget (CPU) decide if a predictable select is more expensive
+  // than the corresponding branch. This information is used in CGP to decide
+  // when to convert selects into branches.
+  PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
 }
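An aside on the setLibcallName() calls above: f128 operations that are not marked Legal are expanded to calls with these names. A hedged source-level illustration, assuming a toolchain and libc that provide `__float128` and `fmodf128` (e.g. a recent glibc); compile-only sketch:

```cpp
// The f128 '%'-style remainder has no PowerPC instruction; per the
// table above, ISD::FREM on f128 is Expand and becomes a call to
// fmodf128. Declaration is an assumption about the target libm.
extern "C" __float128 fmodf128(__float128, __float128);

__float128 frem_example(__float128 a, __float128 b) {
  return fmodf128(a, b); // what the expanded FREM lowers to
}
```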
 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
 /// the desired ByVal argument alignment.
-static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
-                             unsigned MaxMaxAlign) {
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
   if (MaxAlign == MaxMaxAlign)
     return;
   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
-    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
-      MaxAlign = 32;
-    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
-      MaxAlign = 16;
+    if (MaxMaxAlign >= 32 &&
+        VTy->getPrimitiveSizeInBits().getFixedSize() >= 256)
+      MaxAlign = Align(32);
+    else if (VTy->getPrimitiveSizeInBits().getFixedSize() >= 128 &&
+             MaxAlign < 16)
+      MaxAlign = Align(16);
   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
-    unsigned EltAlign = 0;
+    Align EltAlign;
     getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
     if (EltAlign > MaxAlign)
       MaxAlign = EltAlign;
   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
     for (auto *EltTy : STy->elements()) {
-      unsigned EltAlign = 0;
+      Align EltAlign;
       getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
       if (EltAlign > MaxAlign)
         MaxAlign = EltAlign;
@@ -1332,16 +1419,12 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
 /// function arguments in the caller parameter area.
 unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                   const DataLayout &DL) const {
-  // Darwin passes everything on 4 byte boundary.
-  if (Subtarget.isDarwin())
-    return 4;
-
   // 16byte and wider vectors are passed on 16byte boundary.
   // The rest is 8 on PPC64 and 4 on PPC32 boundary.
-  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+  Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
   if (Subtarget.hasAltivec() || Subtarget.hasQPX())
-    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
-  return Align;
+    getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16));
+  return Alignment.value();
 }
 
 bool PPCTargetLowering::useSoftFloat() const {
@@ -1356,6 +1439,16 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   return VT.isScalarInteger();
 }
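An aside on the `unsigned` → `llvm::Align` migration visible throughout this diff: `Align` carries a power-of-two alignment, and helpers such as `alignTo`/`isAligned` replace hand-rolled arithmetic like `((Offset + A - 1) / A) * A`. Plain C++ stand-ins for that arithmetic (a sketch, not the LLVM headers):

```cpp
#include <cassert>
#include <cstdint>

// Round offset up to the next multiple of a power-of-two alignment,
// as llvm::alignTo does.
uint64_t align_to(uint64_t offset, uint64_t align) {
  return (offset + align - 1) & ~(align - 1);
}

// True when offset is already a multiple of the alignment, as
// llvm::isAligned does; replaces (offset % align) == 0.
bool is_aligned(uint64_t offset, uint64_t align) {
  return (offset & (align - 1)) == 0;
}

int main() {
  assert(align_to(13, 8) == 16);
  assert(is_aligned(16, 8) && !is_aligned(13, 8));
}
```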
+/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
+/// type is cheaper than a multiply followed by a shift.
+/// This is true for words and doublewords on 64-bit PowerPC.
+bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
+  if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
+                              isOperationLegal(ISD::MULHU, Type)))
+    return true;
+  return TargetLowering::isMulhCheaperThanMulShift(Type);
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER:    break;
@@ -1377,10 +1470,12 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::FRE:             return "PPCISD::FRE";
   case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
   case PPCISD::STFIWX:          return "PPCISD::STFIWX";
-  case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
-  case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
   case PPCISD::VPERM:           return "PPCISD::VPERM";
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
+  case PPCISD::XXSPLTI_SP_TO_DP:
+    return "PPCISD::XXSPLTI_SP_TO_DP";
+  case PPCISD::XXSPLTI32DX:
+    return "PPCISD::XXSPLTI32DX";
   case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
   case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
@@ -1392,6 +1487,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
   case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
   case PPCISD::DYNAREAOFFSET:   return "PPCISD::DYNAREAOFFSET";
+  case PPCISD::PROBED_ALLOCA:   return "PPCISD::PROBED_ALLOCA";
   case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
   case PPCISD::SRL:             return "PPCISD::SRL";
   case PPCISD::SRA:             return "PPCISD::SRA";
@@ -1399,6 +1495,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::SRA_ADDZE:       return "PPCISD::SRA_ADDZE";
   case PPCISD::CALL:            return "PPCISD::CALL";
   case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
+  case PPCISD::CALL_NOTOC:      return "PPCISD::CALL_NOTOC";
   case PPCISD::MTCTR:           return "PPCISD::MTCTR";
   case PPCISD::BCTRL:           return "PPCISD::BCTRL";
   case PPCISD::BCTRL_LOAD_TOC:  return "PPCISD::BCTRL_LOAD_TOC";
@@ -1412,6 +1509,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::MTVSRZ:          return "PPCISD::MTVSRZ";
   case PPCISD::SINT_VEC_TO_FP:  return "PPCISD::SINT_VEC_TO_FP";
   case PPCISD::UINT_VEC_TO_FP:  return "PPCISD::UINT_VEC_TO_FP";
+  case PPCISD::SCALAR_TO_VECTOR_PERMUTED:
+    return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";
   case PPCISD::ANDI_rec_1_EQ_BIT:
     return "PPCISD::ANDI_rec_1_EQ_BIT";
   case PPCISD::ANDI_rec_1_GT_BIT:
@@ -1425,7 +1524,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::LXSIZX:          return "PPCISD::LXSIZX";
   case PPCISD::STXSIX:          return "PPCISD::STXSIX";
   case PPCISD::VEXTS:           return "PPCISD::VEXTS";
-  case PPCISD::SExtVElems:      return "PPCISD::SExtVElems";
   case PPCISD::LXVD2X:          return "PPCISD::LXVD2X";
   case PPCISD::STXVD2X:         return "PPCISD::STXVD2X";
   case PPCISD::LOAD_VEC_BE:     return "PPCISD::LOAD_VEC_BE";
@@ -1475,7 +1573,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::EXTSWSLI:        return "PPCISD::EXTSWSLI";
   case PPCISD::LD_VSX_LH:       return "PPCISD::LD_VSX_LH";
   case PPCISD::FP_EXTEND_HALF:  return "PPCISD::FP_EXTEND_HALF";
+  case PPCISD::MAT_PCREL_ADDR:  return "PPCISD::MAT_PCREL_ADDR";
   case PPCISD::LD_SPLAT:        return "PPCISD::LD_SPLAT";
+  case PPCISD::FNMSUB:          return "PPCISD::FNMSUB";
   }
   return nullptr;
 }
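An aside on isMulhCheaperThanMulShift above: `mulh[s|u]` yields the high half of a widening multiply directly (e.g. `mulhd` on 64-bit PowerPC), whereas the generic lowering forms the double-width product and shifts it. A minimal sketch of the equivalence, assuming a compiler that provides the `__int128` extension:

```cpp
#include <cassert>
#include <cstdint>

// High 64 bits of the signed 64x64 -> 128 product. On 64-bit PowerPC
// this is a single mulhd; without it, the compiler would build the
// full 128-bit product and shift right by 64.
int64_t mulhs64(int64_t a, int64_t b) {
  return (int64_t)(((__int128)a * b) >> 64);
}

int main() {
  assert(mulhs64(INT64_MAX, 2) == 0);  // product fits: high word is 0
  assert(mulhs64(INT64_MIN, 2) == -1); // negative product: high word is -1
}
```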
@@ -2338,17 +2438,22 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
 /// non-zero and N can be represented by a base register plus a signed 16-bit
 /// displacement, make a more precise judgement by checking (displacement %
 /// EncodingAlignment).
-bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
-                                            SDValue &Index, SelectionDAG &DAG,
-                                            unsigned EncodingAlignment) const {
-  int16_t imm = 0;
+bool PPCTargetLowering::SelectAddressRegReg(
+    SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
+    MaybeAlign EncodingAlignment) const {
+  // If we have a PC Relative target flag don't select as [reg+reg]. It will be
+  // a [pc+imm].
+  if (SelectAddressPCRel(N, Base))
+    return false;
+
+  int16_t Imm = 0;
   if (N.getOpcode() == ISD::ADD) {
     // Is there any SPE load/store (f64), which can't handle 16bit offset?
     // SPE load/store can only handle 8-bit offsets.
     if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
       return true;
-    if (isIntS16Immediate(N.getOperand(1), imm) &&
-        (!EncodingAlignment || !(imm % EncodingAlignment)))
+    if (isIntS16Immediate(N.getOperand(1), Imm) &&
+        (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
       return false; // r+i
     if (N.getOperand(1).getOpcode() == PPCISD::Lo)
       return false; // r+i
@@ -2357,8 +2462,8 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
     Index = N.getOperand(1);
     return true;
   } else if (N.getOpcode() == ISD::OR) {
-    if (isIntS16Immediate(N.getOperand(1), imm) &&
-        (!EncodingAlignment || !(imm % EncodingAlignment)))
+    if (isIntS16Immediate(N.getOperand(1), Imm) &&
+        (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
       return false; // r+i can fold it if we can.
 
     // If this is an or of disjoint bitfields, we can codegen this as an add
@@ -2413,8 +2518,7 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  unsigned Align = MFI.getObjectAlignment(FrameIdx);
-  if (Align >= 4)
+  if (MFI.getObjectAlign(FrameIdx) >= Align(4))
     return;
 
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -2425,12 +2529,17 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
 /// a signed 16-bit displacement [r+imm], and if it is not better
 /// represented as reg+reg.  If \p EncodingAlignment is non-zero, only accept
 /// displacements that are multiples of that value.
-bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
-                                            SDValue &Base,
-                                            SelectionDAG &DAG,
-                                            unsigned EncodingAlignment) const {
+bool PPCTargetLowering::SelectAddressRegImm(
+    SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
+    MaybeAlign EncodingAlignment) const {
   // FIXME dl should come from parent load or store, not from address
   SDLoc dl(N);
+
+  // If we have a PC Relative target flag don't select as [reg+imm]. It will be
+  // a [pc+imm].
+  if (SelectAddressPCRel(N, Base))
+    return false;
+
   // If this can be more profitably realized as r+r, fail.
   if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
     return false;
 
@@ -2438,7 +2547,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
   if (N.getOpcode() == ISD::ADD) {
     int16_t imm = 0;
     if (isIntS16Immediate(N.getOperand(1), imm) &&
-        (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
+        (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
       Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
         Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -2462,7 +2571,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
   } else if (N.getOpcode() == ISD::OR) {
     int16_t imm = 0;
     if (isIntS16Immediate(N.getOperand(1), imm) &&
-        (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
+        (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
       // If this is an or of disjoint bitfields, we can codegen this as an add
       // (for better address arithmetic) if the LHS and RHS of the OR are
       // provably disjoint.
@@ -2489,7 +2598,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
     // this as "d, 0"
     int16_t Imm;
     if (isIntS16Immediate(CN, Imm) &&
-        (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
+        (!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
       Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
       Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                              CN->getValueType(0));
@@ -2499,7 +2608,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
     // Handle 32-bit sext immediates with LIS + addr mode.
     if ((CN->getValueType(0) == MVT::i32 ||
         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
-        (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
+        (!EncodingAlignment ||
+         isAligned(*EncodingAlignment, CN->getZExtValue()))) {
       int Addr = (int)CN->getZExtValue();
 
       // Otherwise, break this down into an LIS + disp.
@@ -2554,6 +2664,27 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
   return true;
 }
 
+template <typename Ty> static bool isValidPCRelNode(SDValue N) {
+  Ty *PCRelCand = dyn_cast<Ty>(N);
+  return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
+}
+
+/// Returns true if this address is a PC Relative address.
+/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
+/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
+bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
+  // This is a materialize PC Relative node. Always select this as PC Relative.
+  Base = N;
+  if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
+    return true;
+  if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
+      isValidPCRelNode<GlobalAddressSDNode>(N) ||
+      isValidPCRelNode<JumpTableSDNode>(N) ||
+      isValidPCRelNode<BlockAddressSDNode>(N))
+    return true;
+  return false;
+}
+
 /// Returns true if we should use a direct load into vector instruction
 /// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
 static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
@@ -2591,7 +2722,8 @@ static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
        UI != UE; ++UI)
     if (UI.getUse().get().getResNo() == 0 &&
-        UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
+        UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
+        UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
       return false;
 
   return true;
@@ -2664,14 +2796,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
 
   // LDU/STU can only handle immediates that are a multiple of 4.
   if (VT != MVT::i64) {
-    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, None))
       return false;
   } else {
     // LDU/STU need an address with at least 4-byte alignment.
     if (Alignment < 4)
       return false;
 
-    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
       return false;
   }
 
@@ -2705,18 +2837,6 @@ static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
     HiOpFlags |= PPCII::MO_PIC_FLAG;
     LoOpFlags |= PPCII::MO_PIC_FLAG;
   }
-
-  // If this is a reference to a global value that requires a non-lazy-ptr, make
-  // sure that instruction lowering adds it.
-  if (GV && Subtarget.hasLazyResolverStub(GV)) {
-    HiOpFlags |= PPCII::MO_NLP_FLAG;
-    LoOpFlags |= PPCII::MO_NLP_FLAG;
-
-    if (GV->hasHiddenVisibility()) {
-      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
-      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
-    }
-  }
 }
 
 static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
@@ -2758,7 +2878,7 @@ SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
   SDValue Ops[] = { GA, Reg };
   return DAG.getMemIntrinsicNode(
       PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
-      MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
+      MachinePointerInfo::getGOT(DAG.getMachineFunction()), None,
       MachineMemOperand::MOLoad);
 }
 
@@ -2771,8 +2891,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
 
   // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
   // The actual address of the GlobalValue is stored in the TOC.
   if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+    if (Subtarget.isUsingPCRelativeCalls()) {
+      SDLoc DL(CP);
+      EVT Ty = getPointerTy(DAG.getDataLayout());
+      SDValue ConstPool = DAG.getTargetConstantPool(
+          C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
+      return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
+    }
     setUsesTOCBasePtr(DAG);
-    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
     return getTOCEntry(DAG, SDLoc(CP), GA);
   }
 
@@ -2781,15 +2908,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
   getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
 
   if (IsPIC && Subtarget.isSVR4ABI()) {
-    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
-                                           PPCII::MO_PIC_FLAG);
+    SDValue GA =
+        DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);
     return getTOCEntry(DAG, SDLoc(CP), GA);
   }
 
   SDValue CPIHi =
-      DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+      DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
   SDValue CPILo =
-      DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+      DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
   return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
 }
 
@@ -2846,6 +2973,16 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
 
+  // isUsingPCRelativeCalls() returns true when PCRelative is enabled
+  if (Subtarget.isUsingPCRelativeCalls()) {
+    SDLoc DL(JT);
+    EVT Ty = getPointerTy(DAG.getDataLayout());
+    SDValue GA =
+        DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);
+    SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+    return MatAddr;
+  }
+
   // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
   // The actual address of the GlobalValue is stored in the TOC.
   if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
@@ -2875,6 +3012,16 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
   BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
   const BlockAddress *BA = BASDN->getBlockAddress();
 
+  // isUsingPCRelativeCalls() returns true when PCRelative is enabled
+  if (Subtarget.isUsingPCRelativeCalls()) {
+    SDLoc DL(BASDN);
+    EVT Ty = getPointerTy(DAG.getDataLayout());
+    SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
+                                           PPCII::MO_PCREL_FLAG);
+    SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+    return MatAddr;
+  }
+
   // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
   // The actual BlockAddress is stored in the TOC.
   if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
@@ -3004,6 +3151,22 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
 
   // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
   // The actual address of the GlobalValue is stored in the TOC.
   if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+    if (Subtarget.isUsingPCRelativeCalls()) {
+      EVT Ty = getPointerTy(DAG.getDataLayout());
+      if (isAccessedAsGotIndirect(Op)) {
+        SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
+                                                PPCII::MO_PCREL_FLAG |
+                                                    PPCII::MO_GOT_FLAG);
+        SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+        SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
+                                   MachinePointerInfo());
+        return Load;
+      } else {
+        SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
+                                                PPCII::MO_PCREL_FLAG);
+        return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+      }
+    }
     setUsesTOCBasePtr(DAG);
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
     return getTOCEntry(DAG, DL, GA);
@@ -3025,13 +3188,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
   SDValue GALo =
     DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
 
-  SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
-
-  // If the global reference is actually to a non-lazy-pointer, we have to do an
-  // extra load to get the address of the global.
-  if (MOHiFlag & PPCII::MO_NLP_FLAG)
-    Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
-  return Ptr;
+  return LowerLabelRef(GAHi, GALo, IsPIC, DAG);
 }
 
 SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -3192,10 +3349,10 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
 
   // We have to copy the entire va_list struct:
   // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
-  return DAG.getMemcpy(Op.getOperand(0), Op,
-                       Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
-                       false, MachinePointerInfo(), MachinePointerInfo());
+  return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
+                       false, true, false, MachinePointerInfo(),
+                       MachinePointerInfo());
 }
 
 SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
@@ -3252,7 +3409,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
 
   SDLoc dl(Op);
 
-  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+  if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
     // vastart just stores the address of the VarArgsFrameIndex slot into the
     // memory location argument.
     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
@@ -3358,31 +3515,31 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
 
 /// CalculateStackSlotAlignment - Calculates the alignment of this argument
 /// on the stack.
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
-                                            ISD::ArgFlagsTy Flags,
-                                            unsigned PtrByteSize) {
-  unsigned Align = PtrByteSize;
+static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+                                         ISD::ArgFlagsTy Flags,
+                                         unsigned PtrByteSize) {
+  Align Alignment(PtrByteSize);
 
   // Altivec parameters are padded to a 16 byte boundary.
   if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
       ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
       ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
       ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
-    Align = 16;
+    Alignment = Align(16);
 
   // QPX vector types stored in double-precision are padded to a 32 byte
   // boundary.
   else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
-    Align = 32;
+    Alignment = Align(32);
 
   // ByVal parameters are aligned as requested.
   if (Flags.isByVal()) {
-    unsigned BVAlign = Flags.getByValAlign();
+    auto BVAlign = Flags.getNonZeroByValAlign();
     if (BVAlign > PtrByteSize) {
-      if (BVAlign % PtrByteSize != 0)
-        llvm_unreachable(
+      if (BVAlign.value() % PtrByteSize != 0)
+        llvm_unreachable(
             "ByVal alignment is not a multiple of the pointer size");
 
-      Align = BVAlign;
+      Alignment = BVAlign;
     }
   }
 
@@ -3392,12 +3549,12 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
     // needs to be aligned to the size of the full type.  (Except for
     // ppcf128, which is only aligned as its f64 components.)
     if (Flags.isSplit() && OrigVT != MVT::ppcf128)
-      Align = OrigVT.getStoreSize();
+      Alignment = Align(OrigVT.getStoreSize());
     else
-      Align = ArgVT.getStoreSize();
+      Alignment = Align(ArgVT.getStoreSize());
   }
 
-  return Align;
+  return Alignment;
 }
 
 /// CalculateStackSlotUsed - Return whether this argument will use its
@@ -3415,9 +3572,9 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
   bool UseMemory = false;
 
   // Respect alignment of argument on the stack.
-  unsigned Align =
-    CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
-  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+  Align Alignment =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+  ArgOffset = alignTo(ArgOffset, Alignment);
   // If there's no space left in the argument save area, we must
   // use memory (this check also catches zero-sized arguments).
   if (ArgOffset >= LinkageSize + ParamAreaSize)
@@ -3461,10 +3618,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
 /// ensure minimum alignment required for target.
 static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                      unsigned NumBytes) {
-  unsigned TargetAlign = Lowering->getStackAlignment();
-  unsigned AlignMask = TargetAlign - 1;
-  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
-  return NumBytes;
+  return alignTo(NumBytes, Lowering->getStackAlign());
 }
 
 SDValue PPCTargetLowering::LowerFormalArguments(
@@ -3527,7 +3681,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
   // Potential tail calls could cause overwriting of argument stack slots.
   bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                        (CallConv == CallingConv::Fast));
-  unsigned PtrByteSize = 4;
+  const Align PtrAlign(4);
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
@@ -3536,7 +3690,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
 
   // Reserve space for the linkage area on the stack.
   unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
-  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+  CCInfo.AllocateStack(LinkageSize, PtrAlign);
   if (useSoftFloat())
     CCInfo.PreAnalyzeFormalArguments(Ins);
@@ -3645,7 +3799,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
                  ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
-  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
 
   CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
@@ -3692,7 +3846,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
                        MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
                                              CCInfo.getNextStackOffset(), true));
 
-    FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
+    FuncInfo->setVarArgsFrameIndex(
+        MFI.CreateStackObject(Depth, Align(8), false));
     SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
 
     // The fixed integer arguments of a variadic function are stored to the
@@ -3839,11 +3994,13 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
   // We re-align the argument offset for each argument, except when using the
   // fast calling convention, when we need to make sure we do that only when
   // we'll actually use a stack slot.
-  unsigned CurArgOffset, Align;
+  unsigned CurArgOffset;
+  Align Alignment;
   auto ComputeArgOffset = [&]() {
     /* Respect alignment of argument on the stack.  */
-    Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
-    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+    Alignment =
+        CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+    ArgOffset = alignTo(ArgOffset, Alignment);
     CurArgOffset = ArgOffset;
   };
@@ -3891,7 +4048,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
         ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
       FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
     else
-      FI = MFI.CreateStackObject(ArgSize, Align, false);
+      FI = MFI.CreateStackObject(ArgSize, Alignment, false);
     SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
 
     // Handle aggregates smaller than 8 bytes.
@@ -4139,7 +4296,11 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
   // If the function takes variable number of arguments, make a frame index for
   // the start of the first vararg value... for expansion of llvm.va_start.
-  if (isVarArg) {
+  // On ELFv2ABI spec, it writes:
+  // C programs that are intended to be *portable* across different compilers
+  // and architectures must use the header file <stdarg.h> to deal with variable
+  // argument lists.
+  if (isVarArg && MFI.hasVAStart()) {
    int Depth = ArgOffset;
 
    FuncInfo->setVarArgsFrameIndex(
@@ -4547,30 +4708,67 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
 
 static bool isFunctionGlobalAddress(SDValue Callee);
 
-static bool
-callsShareTOCBase(const Function *Caller, SDValue Callee,
-                  const TargetMachine &TM) {
-  // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
-  // don't have enough information to determine if the caller and calle share
-  // the same TOC base, so we have to pessimistically assume they don't for
-  // correctness.
-  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-  if (!G)
-    return false;
-
-  const GlobalValue *GV = G->getGlobal();
+static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
+                              const TargetMachine &TM) {
+  // It does not make sense to call callsShareTOCBase() with a caller that
+  // is PC Relative since PC Relative callers do not have a TOC.
+#ifndef NDEBUG
+  const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
+  assert(!STICaller->isUsingPCRelativeCalls() &&
         "PC Relative callers do not have a TOC and cannot share a TOC Base");
+#endif
+
+  // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
ExternalSymbols + // don't have enough information to determine if the caller and callee share + // the same TOC base, so we have to pessimistically assume they don't for + // correctness. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (!G) + return false; + + const GlobalValue *GV = G->getGlobal(); + + // If the callee is preemptable, then the static linker will use a plt-stub + // which saves the toc to the stack, and needs a nop after the call + // instruction to convert to a toc-restore. + if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV)) + return false; + + // Functions with PC Relative enabled may clobber the TOC in the same DSO. + // We may need a TOC restore in the situation where the caller requires a + // valid TOC but the callee is PC Relative and does not. + const Function *F = dyn_cast<Function>(GV); + const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV); + + // If we have an Alias we can try to get the function from there. + if (Alias) { + const GlobalObject *GlobalObj = Alias->getBaseObject(); + F = dyn_cast<Function>(GlobalObj); + } + + // If we still have no valid function pointer we do not have enough + // information to determine if the callee uses PC Relative calls so we must + // assume that it does. + if (!F) + return false; + + // If the callee uses PC Relative we cannot guarantee that the callee won't + // clobber the TOC of the caller and so we must assume that the two + // functions do not share a TOC base. + const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F); + if (STICallee->isUsingPCRelativeCalls()) + return false; + // The medium and large code models are expected to provide a sufficiently // large TOC to provide all data addressing needs of a module with a - // single TOC. Since each module will be addressed with a single TOC then we - // only need to check that caller and callee don't cross dso boundaries. + // single TOC. if (CodeModel::Medium == TM.getCodeModel() || CodeModel::Large == TM.getCodeModel()) - return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV); + return true; // Otherwise we need to ensure callee and caller are in the same section, // since the linker may allocate multiple TOCs, and we don't know which // sections will belong to the same TOC base. - if (!GV->isStrongDefinitionForLinker()) return false; @@ -4585,26 +4783,6 @@ callsShareTOCBase(const Function *Caller, SDValue Callee, return false; } - // If the callee might be interposed, then we can't assume the ultimate call - // target will be in the same section. Even in cases where we can assume that - // interposition won't happen, in any case where the linker might insert a - // stub to allow for interposition, we must generate code as though - // interposition might occur. To understand why this matters, consider a - // situation where: a -> b -> c where the arrows indicate calls. b and c are - // in the same section, but a is in a different module (i.e. has a different - // TOC base pointer). If the linker allows for interposition between b and c, - // then it will generate a stub for the call edge between b and c which will - // save the TOC pointer into the designated stack slot allocated by b. If we - // return true here, and therefore allow a tail call between b and c, that - // stack slot won't exist and the b -> c stub will end up saving b'c TOC base - // pointer into the stack slot allocated by a (where the a -> b stub saved - // a's TOC base pointer). 
-  // whether a nop is needed after the call instruction in b, because the linker
-  // will insert a stub, it might complain about a missing nop if we omit it
-  // (although many don't complain in this case).
-  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
-    return false;
-
   return true;
 }
 
@@ -4646,13 +4824,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
   return false;
 }
 
-static bool
-hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
-  if (CS.arg_size() != CallerFn->arg_size())
+static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
+  if (CB.arg_size() != CallerFn->arg_size())
     return false;
 
-  ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
-  ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
+  auto CalleeArgIter = CB.arg_begin();
+  auto CalleeArgEnd = CB.arg_end();
   Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
 
   for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
@@ -4694,15 +4871,10 @@ areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
   return CallerCC == CallingConv::C || CallerCC == CalleeCC;
 }
 
-bool
-PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
-    SDValue Callee,
-    CallingConv::ID CalleeCC,
-    ImmutableCallSite CS,
-    bool isVarArg,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<ISD::InputArg> &Ins,
-    SelectionDAG& DAG) const {
+bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+    SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
 
   if (DisableSCO && !TailCallOpt) return false;
@@ -4744,15 +4916,22 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
       needStackSlotPassParameters(Subtarget, Outs))
     return false;
 
-  // No TCO/SCO on indirect call because Caller have to restore its TOC
-  if (!isFunctionGlobalAddress(Callee) &&
-      !isa<ExternalSymbolSDNode>(Callee))
+  // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
+  // the caller and callee share the same TOC for TCO/SCO. If the caller and
+  // callee potentially have different TOC bases then we cannot tail call since
+  // we need to restore the TOC pointer after the call.
+  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+  // We cannot guarantee this for indirect calls or calls to external functions.
+  // When PC-Relative addressing is used, the concept of the TOC is no longer
+  // applicable so this check is not required.
+  // Check first for indirect calls.
+  if (!Subtarget.isUsingPCRelativeCalls() &&
+      !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee))
     return false;
 
-  // If the caller and callee potentially have different TOC bases then we
-  // cannot tail call since we need to restore the TOC pointer after the call.
-  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
-  if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
+  // Check if we share the TOC base.
+  if (!Subtarget.isUsingPCRelativeCalls() &&
+      !callsShareTOCBase(&Caller, Callee, getTargetMachine()))
     return false;
 
   // TCO allows altering callee ABI, so we don't have to check further.
@@ -4764,10 +4943,14 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
 
   // If callee use the same argument list that caller is using, then we can
   // apply SCO on this case. If it is not, then we need to check if callee needs
   // stack for passing arguments.
-  if (!hasSameArgumentList(&Caller, CS) &&
-      needStackSlotPassParameters(Subtarget, Outs)) {
+  // PC Relative tail calls may not have a CallBase.
+  // If there is no CallBase we cannot verify if we have the same argument
+  // list so assume that we don't have the same argument list.
+  if (CB && !hasSameArgumentList(&Caller, *CB) &&
+      needStackSlotPassParameters(Subtarget, Outs))
+    return false;
+  else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
     return false;
-  }
 
   return true;
 }
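An aside on the eligibility checks above: a hypothetical source-level shape of a call that typically passes this screening, where the caller merely forwards its arguments and returns the callee's result, so the call can be emitted as a branch once the two functions are known to share a TOC base (names are illustrative only):

```cpp
#include <cstdio>

long callee(long a, long b) { return a + b; }

// Same calling convention, same argument list, no stack-passed
// arguments: a candidate for tail/sibling-call optimization, where
// no new stack frame or TOC restore is needed.
long caller(long a, long b) { return callee(a, b); }

int main() { std::printf("%ld\n", caller(2, 3)); }
```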
@@ -4876,18 +5059,6 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
     SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
     Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
                          MachinePointerInfo::getFixedStack(MF, NewRetAddr));
-
-    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
-    // slot as the FP is never overwritten.
-    if (Subtarget.isDarwinABI()) {
-      int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
-      int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
-                                                         true);
-      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
-      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
-                           MachinePointerInfo::getFixedStack(
-                               DAG.getMachineFunction(), NewFPIdx));
-    }
   }
   return Chain;
 }
@@ -4922,14 +5093,6 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
     LROpOut = getReturnAddrFrameIndex(DAG);
     LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
     Chain = SDValue(LROpOut.getNode(), 1);
-
-    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
-    // slot as the FP is never overwritten.
-    if (Subtarget.isDarwinABI()) {
-      FPOpOut = getFramePointerFrameIndex(DAG);
-      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
-      Chain = SDValue(FPOpOut.getNode(), 1);
-    }
   }
   return Chain;
 }
@@ -4944,9 +5107,9 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                          SDValue Chain, ISD::ArgFlagsTy Flags,
                                          SelectionDAG &DAG, const SDLoc &dl) {
   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
-  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
-                       false, false, false, MachinePointerInfo(),
-                       MachinePointerInfo());
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
+                       Flags.getNonZeroByValAlign(), false, false, false,
+                       MachinePointerInfo(), MachinePointerInfo());
 }
 
 /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
@@ -5097,28 +5260,37 @@ static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
   return true;
 }
 
+// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
+static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
+  return Subtarget.isAIXABI() ||
+         (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
+}
+
-static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
-                              bool isTailCall, const Function &Caller,
+static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
+                              const Function &Caller,
                               const SDValue &Callee,
                               const PPCSubtarget &Subtarget,
                               const TargetMachine &TM) {
-  if (isTailCall)
+  if (CFlags.IsTailCall)
     return PPCISD::TC_RETURN;
 
   // This is a call through a function pointer.
-  if (isIndirectCall) {
+  if (CFlags.IsIndirect) {
     // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
     // indirect calls. The save of the caller's TOC pointer to the stack will be
     // inserted into the DAG as part of call lowering. The restore of the TOC
     // pointer is modeled by using a pseudo instruction for the call opcode that
     // represents the 2 instruction sequence of an indirect branch and link,
     // immediately followed by a load of the TOC pointer from the the stack save
-    // slot into gpr2.
-    if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
-      return PPCISD::BCTRL_LOAD_TOC;
+    // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
+    // as it is not saved or used.
+    return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
+                                               : PPCISD::BCTRL;
+  }
 
-    // An indirect call that does not need a TOC restore.
-    return PPCISD::BCTRL;
+  if (Subtarget.isUsingPCRelativeCalls()) {
+    assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
+    return PPCISD::CALL_NOTOC;
   }
 
   // The ABIs that maintain a TOC pointer accross calls need to have a nop
@@ -5136,14 +5308,6 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
   return PPCISD::CALL;
 }
 
-static bool isValidAIXExternalSymSDNode(StringRef SymName) {
-  return StringSwitch<bool>(SymName)
-      .Cases("__divdi3", "__fixunsdfdi", "__floatundidf", "__floatundisf",
-             "__moddi3", "__udivdi3", "__umoddi3", true)
-      .Cases("ceil", "floor", "memcpy", "memmove", "memset", "round", true)
-      .Default(false);
-}
-
 static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                                const SDLoc &dl, const PPCSubtarget &Subtarget) {
   if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
@@ -5179,14 +5343,14 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
     MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
         Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
 
-    if (IsDeclaration && !S->hasContainingCsect()) {
+    if (IsDeclaration && !S->hasRepresentedCsectSet()) {
       // On AIX, an undefined symbol needs to be associated with a
      // MCSectionXCOFF to get the correct storage mapping class.
      // In this case, XCOFF::XMC_PR.
      MCSectionXCOFF *Sec = Context.getXCOFFSection(
-          S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
+          S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
           SectionKind::getMetadata());
-      S->setContainingCsect(Sec);
+      S->setRepresentedCsect(Sec);
     }
 
     MVT PtrVT =
@@ -5227,12 +5391,7 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                                                SC);
     }
 
-    // TODO: Remove this when the support for ExternalSymbolSDNode is complete.
-    if (isValidAIXExternalSymSDNode(SymName)) {
-      return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
-    }
-
-    report_fatal_error("Unexpected ExternalSymbolSDNode: " + Twine(SymName));
+    return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
   }
 
   // No transformation needed.
@@ -5270,7 +5429,7 @@ static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
 static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                           SDValue &Glue, SDValue &Chain,
                                           SDValue CallSeqStart,
-                                          ImmutableCallSite CS, const SDLoc &dl,
+                                          const CallBase *CB, const SDLoc &dl,
                                           bool hasNest,
                                           const PPCSubtarget &Subtarget) {
   // Function pointers in the 64-bit SVR4 ABI do not point to the function
@@ -5306,7 +5465,7 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                           MachineMemOperand::MOInvariant)
                        : MachineMemOperand::MONone;
 
-  MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
+  MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
CS.getCalledValue() : nullptr); + MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr); // Registers used in building the DAG. const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister(); @@ -5360,12 +5519,12 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee, } static void -buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv, - const SDLoc &dl, bool isTailCall, bool isVarArg, - bool isPatchPoint, bool hasNest, SelectionDAG &DAG, +buildCallOperands(SmallVectorImpl<SDValue> &Ops, + PPCTargetLowering::CallFlags CFlags, const SDLoc &dl, + SelectionDAG &DAG, SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff, - const PPCSubtarget &Subtarget, bool isIndirect) { + const PPCSubtarget &Subtarget) { const bool IsPPC64 = Subtarget.isPPC64(); // MVT for a general purpose register. const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; @@ -5374,10 +5533,10 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv, Ops.push_back(Chain); // If it's a direct call pass the callee as the second operand. - if (!isIndirect) + if (!CFlags.IsIndirect) Ops.push_back(Callee); else { - assert(!isPatchPoint && "Patch point call are not indirect."); + assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect."); // For the TOC based ABIs, we have saved the TOC pointer to the linkage area // on the stack (this would have been done in `LowerCall_64SVR4` or @@ -5386,7 +5545,9 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv, // pointer from the linkage area. The operand for the TOC restore is an add // of the TOC save offset to the stack pointer. This must be the second // operand: after the chain input but before any other variadic arguments. - if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { + // For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not + // saved or used. + if (isTOCSaveRestoreRequired(Subtarget)) { const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT); @@ -5397,18 +5558,18 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv, } // Add the register used for the environment pointer. - if (Subtarget.usesFunctionDescriptors() && !hasNest) + if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest) Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(), RegVT)); // Add CTR register as callee so a bctr can be emitted later. - if (isTailCall) + if (CFlags.IsTailCall) Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT)); } // If this is a tail call add stack pointer delta. - if (isTailCall) + if (CFlags.IsTailCall) Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live @@ -5420,17 +5581,18 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv, // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is // no way to mark dependencies as implicit here. // We will add the R2/X2 dependency in EmitInstrWithCustomInserter. 
-    if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint
+    if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
+        !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
       Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));

   // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
-  if (isVarArg && Subtarget.is32BitELFABI())
+  if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
     Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

   // Add a register mask operand representing the call-preserved registers.
   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *Mask =
-      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+      TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -5440,44 +5602,47 @@
 }

 SDValue PPCTargetLowering::FinishCall(
-    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
-    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+    CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
     SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
     unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
-    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
+    SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {

-  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI())
+  if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
+      Subtarget.isAIXABI())
     setUsesTOCBasePtr(DAG);

-  const bool isIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint);
-  unsigned CallOpc = getCallOpcode(isIndirect, isPatchPoint, isTailCall,
-                                   DAG.getMachineFunction().getFunction(),
-                                   Callee, Subtarget, DAG.getTarget());
+  unsigned CallOpc =
+      getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
+                    Subtarget, DAG.getTarget());

-  if (!isIndirect)
+  if (!CFlags.IsIndirect)
     Callee = transformCallee(Callee, DAG, dl, Subtarget);
   else if (Subtarget.usesFunctionDescriptors())
-    prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CS,
-                                  dl, hasNest, Subtarget);
+    prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
+                                  dl, CFlags.HasNest, Subtarget);
   else
     prepareIndirectCall(DAG, Callee, Glue, Chain, dl);

   // Build the operand list for the call instruction.
   SmallVector<SDValue, 8> Ops;
-  buildCallOperands(Ops, CallConv, dl, isTailCall, isVarArg, isPatchPoint,
-                    hasNest, DAG, RegsToPass, Glue, Chain, Callee, SPDiff,
-                    Subtarget, isIndirect);
+  buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
+                    SPDiff, Subtarget);

   // Emit tail call.
-  if (isTailCall) {
+  if (CFlags.IsTailCall) {
+    // Indirect tail calls when using PC Relative calls do not have the same
+    // constraints.
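Reviewer note: the getCallOpcode()/isTOCSaveRestoreRequired() rework above compresses the opcode choice into one small decision tree. A minimal standalone sketch of that tree follows, with an enum standing in for the PPCISD opcodes and a plain struct standing in for the PPCSubtarget queries (these stand-ins are assumptions, not the real API); the final CALL vs. CALL_NOP split additionally depends on callsShareTOCBase(), which is elided here.

#include <cassert>

enum class Opc { TC_RETURN, BCTRL_LOAD_TOC, BCTRL, CALL_NOTOC, CALL_NOP, CALL };
struct Flags { bool IsTailCall, IsIndirect; };
struct ST { bool IsAIX, Is64BitELF, UsesPCRel; }; // stand-ins for subtarget queries

// AIX and 64-bit ELF without PCRel must save/restore the TOC around
// indirect calls (mirrors isTOCSaveRestoreRequired()).
static bool tocSaveRestore(const ST &S) {
  return S.IsAIX || (S.Is64BitELF && !S.UsesPCRel);
}

static Opc callOpcode(Flags F, const ST &S) {
  if (F.IsTailCall)
    return Opc::TC_RETURN;
  if (F.IsIndirect) // bctrl, plus a TOC reload afterwards when required
    return tocSaveRestore(S) ? Opc::BCTRL_LOAD_TOC : Opc::BCTRL;
  if (S.UsesPCRel) { // direct PC-relative call: no TOC to maintain
    assert(S.Is64BitELF && "PC Relative is only on ELF ABI.");
    return Opc::CALL_NOTOC;
  }
  // TOC-maintaining ABIs leave a nop after the call for the linker to
  // patch into a TOC restore when caller and callee may use different TOCs.
  return (S.IsAIX || S.Is64BitELF) ? Opc::CALL_NOP : Opc::CALL;
}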
assert(((Callee.getOpcode() == ISD::Register && cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) || Callee.getOpcode() == ISD::TargetExternalSymbol || Callee.getOpcode() == ISD::TargetGlobalAddress || - isa<ConstantSDNode>(Callee)) && - "Expecting a global address, external symbol, absolute value or " - "register"); + isa<ConstantSDNode>(Callee) || + (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) && + "Expecting a global address, external symbol, absolute value, " + "register or an indirect tail call when PC Relative calls are " + "used."); + // PC Relative calls also use TC_RETURN as the way to mark tail calls. assert(CallOpc == PPCISD::TC_RETURN && "Unexpected call opcode for a tail call."); DAG.getMachineFunction().getFrameInfo().setHasTailCall(); @@ -5486,12 +5651,13 @@ SDValue PPCTargetLowering::FinishCall( std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}}; Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge); Glue = Chain.getValue(1); // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in // PPCFrameLowering::eliminateCallFramePseudoInstr. - int BytesCalleePops = (CallConv == CallingConv::Fast && + int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast && getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; @@ -5501,7 +5667,8 @@ SDValue PPCTargetLowering::FinishCall( Glue, dl); Glue = Chain.getValue(1); - return LowerCallResult(Chain, Glue, CallConv, isVarArg, Ins, dl, DAG, InVals); + return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl, + DAG, InVals); } SDValue @@ -5518,15 +5685,14 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; - ImmutableCallSite CS = CLI.CS; + const CallBase *CB = CLI.CB; if (isTailCall) { - if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) + if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall())) isTailCall = false; else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) - isTailCall = - IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, - isVarArg, Outs, Ins, DAG); + isTailCall = IsEligibleForTailCallOptimization_64SVR4( + Callee, CallConv, CB, isVarArg, Outs, Ins, DAG); else isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); @@ -5535,21 +5701,23 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!getTargetMachine().Options.GuaranteedTailCallOpt) ++NumSiblingCalls; - assert(isa<GlobalAddressSDNode>(Callee) && + // PC Relative calls no longer guarantee that the callee is a Global + // Address Node. The callee could be an indirect tail call in which + // case the SDValue for the callee could be a load (to load the address + // of a function pointer) or it may be a register copy (to move the + // address of the callee from a function parameter into a virtual + // register). It may also be an ExternalSymbolSDNode (ex memcopy). 
+ assert((Subtarget.isUsingPCRelativeCalls() || + isa<GlobalAddressSDNode>(Callee)) && "Callee should be an llvm::Function object."); - LLVM_DEBUG( - const GlobalValue *GV = - cast<GlobalAddressSDNode>(Callee)->getGlobal(); - const unsigned Width = - 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0"); - dbgs() << "TCO caller: " - << left_justify(DAG.getMachineFunction().getName(), Width) - << ", callee linkage: " << GV->getVisibility() << ", " - << GV->getLinkage() << "\n"); + + LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName() + << "\nTCO callee: "); + LLVM_DEBUG(Callee.dump()); } } - if (!isTailCall && CS && CS.isMustTailCall()) + if (!isTailCall && CB && CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -5560,42 +5728,49 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, !isTailCall) Callee = LowerGlobalAddress(Callee, DAG); + CallFlags CFlags( + CallConv, isTailCall, isVarArg, isPatchPoint, + isIndirectCall(Callee, DAG, Subtarget, isPatchPoint), + // hasNest + Subtarget.is64BitELFABI() && + any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }), + CLI.NoMerge); + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) - return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); + return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, + InVals, CB); if (Subtarget.isSVR4ABI()) - return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); + return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, + InVals, CB); if (Subtarget.isAIXABI()) - return LowerCall_AIX(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); + return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, + InVals, CB); - return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, - isTailCall, isPatchPoint, Outs, OutVals, Ins, - dl, DAG, InVals, CS); + return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, + InVals, CB); } SDValue PPCTargetLowering::LowerCall_32SVR4( - SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, bool isPatchPoint, + SDValue Chain, SDValue Callee, CallFlags CFlags, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite CS) const { + const CallBase *CB) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. + const CallingConv::ID CallConv = CFlags.CallConv; + const bool IsVarArg = CFlags.IsVarArg; + const bool IsTailCall = CFlags.IsTailCall; + assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold || CallConv == CallingConv::Fast) && "Unknown calling convention!"); - unsigned PtrByteSize = 4; + const Align PtrAlign(4); MachineFunction &MF = DAG.getMachineFunction(); @@ -5614,15 +5789,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Assign locations to all of the outgoing arguments. 
SmallVector<CCValAssign, 16> ArgLocs; - PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), - PtrByteSize); + PtrAlign); if (useSoftFloat()) CCInfo.PreAnalyzeCallOperands(Outs); - if (isVarArg) { + if (IsVarArg) { // Handle fixed and variable vector arguments differently. // Fixed vector arguments go into registers as long as registers are // available. Variable vector arguments always go into memory. @@ -5657,10 +5832,10 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Assign locations to all of the outgoing aggregate by value arguments. SmallVector<CCValAssign, 16> ByValArgLocs; - CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); + CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. - CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); + CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign); CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); @@ -5671,7 +5846,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. - int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); + int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass @@ -5767,7 +5942,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( assert(VA.isMemLoc()); unsigned LocMemOffset = VA.getLocMemOffset(); - if (!isTailCall) { + if (!IsTailCall) { SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), StackPtr, PtrOff); @@ -5796,7 +5971,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Set CR bit 6 to true if this is a vararg call with floating args passed in // registers. 
- if (isVarArg) { + if (IsVarArg) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, InFlag }; @@ -5806,14 +5981,12 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( InFlag = Chain.getValue(1); } - if (isTailCall) + if (IsTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, - /* unused except on PPC64 ELFv1 */ false, DAG, - RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, - NumBytes, Ins, InVals, CS); + return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + Callee, SPDiff, NumBytes, Ins, InVals, CB); } // Copy an argument into memory, being careful to do this outside the @@ -5834,25 +6007,24 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( } SDValue PPCTargetLowering::LowerCall_64SVR4( - SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, bool isPatchPoint, + SDValue Chain, SDValue Callee, CallFlags CFlags, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite CS) const { + const CallBase *CB) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); - bool hasNest = false; bool IsSibCall = false; + bool IsFastCall = CFlags.CallConv == CallingConv::Fast; EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); - if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) + if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) IsSibCall = true; // Mark this function as potentially containing a function that contains a @@ -5860,11 +6032,10 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // and restoring the callers stack pointer in this functions epilog. This is // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). - if (getTargetMachine().Options.GuaranteedTailCallOpt && - CallConv == CallingConv::Fast) + if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall) MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); - assert(!(CallConv == CallingConv::Fast && isVarArg) && + assert(!(IsFastCall && CFlags.IsVarArg) && "fastcc not supported on varargs functions"); // Count how many bytes are to be pushed on the stack, including the linkage @@ -5894,7 +6065,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // can be passed to the callee in registers. // For the fast calling convention, there is another check below. // Note: We should keep consistent with LowerFormalArguments_64SVR4() - bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; + bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall; if (!HasParameterArea) { unsigned ParamAreaSize = NumGPRs * PtrByteSize; unsigned AvailableFPRs = NumFPRs; @@ -5916,7 +6087,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Avoid allocating parameter area for fastcc functions if all the arguments // can be passed in the registers. - if (CallConv == CallingConv::Fast) + if (IsFastCall) HasParameterArea = false; // Add up all the space actually used. 
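Reviewer note on the HasParameterArea logic above, before the sizing loop continues below: on 64-bit ELFv2 the parameter save area may be omitted only when every argument travels in registers. A simplified, self-contained sketch of that rule, assuming one register per argument; aggregates, multi-register arguments, and the separate fastcc handling are left out for brevity.

#include <cstddef>
#include <vector>

enum class ArgClass { GPR, FPR, VR };

// ELFv2 argument registers: r3-r10, f1-f13, v2-v13.
static bool needsParameterSaveArea(const std::vector<ArgClass> &Args,
                                   bool IsVarArg) {
  if (IsVarArg) // varargs always get a save area for va_arg to walk
    return true;
  std::size_t GPRs = 0, FPRs = 0, VRs = 0;
  for (ArgClass C : Args) {
    if (C == ArgClass::GPR) ++GPRs;
    else if (C == ArgClass::FPR) ++FPRs;
    else ++VRs;
  }
  return GPRs > 8 || FPRs > 13 || VRs > 12;
}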
@@ -5928,7 +6099,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     if (Flags.isNest())
       continue;

-    if (CallConv == CallingConv::Fast) {
+    if (IsFastCall) {
       if (Flags.isByVal()) {
         NumGPRsUsed += (Flags.getByValSize()+7)/8;
         if (NumGPRsUsed > NumGPRs)
@@ -5976,9 +6147,9 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     }

     /* Respect alignment of argument on the stack. */
-    unsigned Align =
-        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
-    NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+    auto Alignment =
+        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+    NumBytes = alignTo(NumBytes, Alignment);

     NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
     if (Flags.isInConsecutiveRegsLast())
@@ -6001,8 +6172,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     NumBytes = LinkageSize;

   // Tail call needs the stack to be aligned.
-  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
-      CallConv == CallingConv::Fast)
+  if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
     NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

   int SPDiff = 0;
@@ -6010,11 +6180,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
   // Calculate by how many bytes the stack has to be adjusted in case of tail
   // call optimization.
   if (!IsSibCall)
-    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+    SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);

   // To protect arguments on the stack from being clobbered in a tail call,
   // force all the loads to happen before doing any other lowering.
-  if (isTailCall)
+  if (CFlags.IsTailCall)
     Chain = DAG.getStackArgumentTokenFactor(Chain);

   // Adjust the stack pointer for the new arguments...
@@ -6058,16 +6228,16 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     // we'll actually use a stack slot.
     auto ComputePtrOff = [&]() {
       /* Respect alignment of argument on the stack. */
-      unsigned Align =
-          CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
-      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+      auto Alignment =
+          CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+      ArgOffset = alignTo(ArgOffset, Alignment);

       PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
       PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
     };

-    if (CallConv != CallingConv::Fast) {
+    if (!IsFastCall) {
       ComputePtrOff();

       /* Compute GPR index associated with argument offset. */
@@ -6098,7 +6268,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       if (Size == 0)
         continue;

-      if (CallConv == CallingConv::Fast)
+      if (IsFastCall)
        ComputePtrOff();

       // All aggregates smaller than 8 bytes must be passed right-justified.
@@ -6203,7 +6373,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(

     if (Flags.isNest()) {
       // The 'nest' parameter, if any, is passed in R11.
RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); - hasNest = true; break; } @@ -6213,18 +6382,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (GPR_idx != NumGPRs) { RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); } else { - if (CallConv == CallingConv::Fast) + if (IsFastCall) ComputePtrOff(); assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - true, isTailCall, false, MemOpChains, + true, CFlags.IsTailCall, false, MemOpChains, TailCallArguments, dl); - if (CallConv == CallingConv::Fast) + if (IsFastCall) ArgOffset += PtrByteSize; } - if (CallConv != CallingConv::Fast) + if (!IsFastCall) ArgOffset += PtrByteSize; break; case MVT::f32: @@ -6238,7 +6407,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Unnamed arguments for vararg functions always go to GPRs and // then the parameter save area. For now, put all arguments to vararg // routines always in both locations (FPR *and* GPR or stack slot). - bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; + bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs; bool NeededLoad = false; // First load the argument into the next available FPR. @@ -6248,7 +6417,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Next, load the argument into GPR or stack slot if needed. if (!NeedGPROrStack) ; - else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { + else if (GPR_idx != NumGPRs && !IsFastCall) { // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 // once we support fp <-> gpr moves. @@ -6292,7 +6461,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (ArgVal.getNode()) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); } else { - if (CallConv == CallingConv::Fast) + if (IsFastCall) ComputePtrOff(); // Single-precision floating-point values are mapped to the @@ -6306,7 +6475,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( assert(HasParameterArea && "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - true, isTailCall, false, MemOpChains, + true, CFlags.IsTailCall, false, MemOpChains, TailCallArguments, dl); NeededLoad = true; @@ -6314,7 +6483,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // When passing an array of floats, the array occupies consecutive // space in the argument area; only round up to the next doubleword // at the end of the array. Otherwise, each float takes 8 bytes. - if (CallConv != CallingConv::Fast || NeededLoad) { + if (!IsFastCall || NeededLoad) { ArgOffset += (Arg.getValueType() == MVT::f32 && Flags.isInConsecutiveRegs()) ? 4 : 8; if (Flags.isInConsecutiveRegsLast()) @@ -6339,7 +6508,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // usual; unnamed arguments always go to the stack or the corresponding // GPRs when within range. For now, we always put the value in both // locations (or even all three). 
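Reviewer note: the "both locations (or even all three)" rule described above can be summarized as a tiny placement function. A sketch under simplifying assumptions (fastcc and the GPR reload of the stored value are ignored); the key point is that a vararg floating-point value is always shadowed in the GPR/stack image, because the callee's va_arg only walks the integer side of the parameter words.

struct Homes { bool FPR, GPR, Memory; };

static Homes homesForFPArg(bool IsVarArg, unsigned FPRIdx, unsigned NumFPRs,
                           unsigned GPRIdx, unsigned NumGPRs) {
  Homes H{};
  H.FPR = FPRIdx < NumFPRs;                      // next free FPR, if any
  const bool NeedGPROrStack = IsVarArg || !H.FPR;
  H.GPR = NeedGPROrStack && GPRIdx < NumGPRs;    // mirror into a GPR if free
  H.Memory = IsVarArg || (NeedGPROrStack && !H.GPR); // vararg: always shadowed
  return H;
}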
-    if (isVarArg) {
+    if (CFlags.IsVarArg) {
       assert(HasParameterArea &&
              "Parameter area must exist if we have a varargs call.");
       // We could elide this store in the case where the object fits
@@ -6371,19 +6540,19 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       if (VR_idx != NumVRs) {
         RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
       } else {
-        if (CallConv == CallingConv::Fast)
+        if (IsFastCall)
           ComputePtrOff();

         assert(HasParameterArea &&
                "Parameter area must exist to pass an argument in memory.");
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
-                         true, isTailCall, true, MemOpChains,
+                         true, CFlags.IsTailCall, true, MemOpChains,
                          TailCallArguments, dl);
-        if (CallConv == CallingConv::Fast)
+        if (IsFastCall)
           ArgOffset += 16;
       }

-      if (CallConv != CallingConv::Fast)
+      if (!IsFastCall)
         ArgOffset += 16;
       break;
     } // not QPX
@@ -6395,7 +6564,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     case MVT::v4f64:
     case MVT::v4i1: {
       bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
-      if (isVarArg) {
+      if (CFlags.IsVarArg) {
         assert(HasParameterArea &&
                "Parameter area must exist if we have a varargs call.");
         // We could elide this store in the case where the object fits
@@ -6427,19 +6596,19 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       if (QFPR_idx != NumQFPRs) {
         RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
       } else {
-        if (CallConv == CallingConv::Fast)
+        if (IsFastCall)
          ComputePtrOff();

         assert(HasParameterArea &&
                "Parameter area must exist to pass an argument in memory.");
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
-                         true, isTailCall, true, MemOpChains,
+                         true, CFlags.IsTailCall, true, MemOpChains,
                          TailCallArguments, dl);
-        if (CallConv == CallingConv::Fast)
+        if (IsFastCall)
           ArgOffset += (IsF32 ? 16 : 32);
       }

-      if (CallConv != CallingConv::Fast)
+      if (!IsFastCall)
         ArgOffset += (IsF32 ? 16 : 32);
       break;
     }
@@ -6456,23 +6625,26 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
   // Check if this is an indirect call (MTCTR/BCTRL).
   // See prepareDescriptorIndirectCall and buildCallOperands for more
   // information about calls through function pointers in the 64-bit SVR4 ABI.
-  if (!isTailCall && !isPatchPoint &&
-      !isFunctionGlobalAddress(Callee) &&
-      !isa<ExternalSymbolSDNode>(Callee)) {
-    // Load r2 into a virtual register and store it to the TOC save area.
-    setUsesTOCBasePtr(DAG);
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
-    // TOC save area offset.
-    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
-    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
-    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
-    Chain = DAG.getStore(
-        Val.getValue(1), dl, Val, AddPtr,
-        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
+  if (CFlags.IsIndirect) {
+    // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
+    // caller in the TOC save area.
+    if (isTOCSaveRestoreRequired(Subtarget)) {
+      assert(!CFlags.IsTailCall && "Indirect tail calls not supported");
+      // Load r2 into a virtual register and store it to the TOC save area.
+      setUsesTOCBasePtr(DAG);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+      // TOC save area offset.
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack( + DAG.getMachineFunction(), TOCSaveOffset)); + } // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. - if (isELFv2ABI && !isPatchPoint) + if (isELFv2ABI && !CFlags.IsPatchPoint) RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } @@ -6485,23 +6657,21 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( InFlag = Chain.getValue(1); } - if (isTailCall && !IsSibCall) + if (CFlags.IsTailCall && !IsSibCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest, - DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, - SPDiff, NumBytes, Ins, InVals, CS); + return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + Callee, SPDiff, NumBytes, Ins, InVals, CB); } SDValue PPCTargetLowering::LowerCall_Darwin( - SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, bool isPatchPoint, + SDValue Chain, SDValue Callee, CallFlags CFlags, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite CS) const { + const CallBase *CB) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -6516,7 +6686,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // done because by tail calling the called function might overwrite the value // in this function's (MF) stack pointer stack slot 0(SP). if (getTargetMachine().Options.GuaranteedTailCallOpt && - CallConv == CallingConv::Fast) + CFlags.CallConv == CallingConv::Fast) MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); // Count how many bytes are to be pushed on the stack, including the linkage @@ -6539,7 +6709,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { - if (!isVarArg && !isPPC64) { + if (!CFlags.IsVarArg && !isPPC64) { // Non-varargs Altivec parameters go after all the non-Altivec // parameters; handle those later so we know how much padding we need. nAltivecParamsAtEnd++; @@ -6566,16 +6736,16 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && - CallConv == CallingConv::Fast) + CFlags.CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. - int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); + int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. - if (isTailCall) + if (CFlags.IsTailCall) Chain = DAG.getStackArgumentTokenFactor(Chain); // Adjust the stack pointer for the new arguments... 
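Reviewer note on the TOC save emitted above: the caller's r2 is stored at a fixed slot in the linkage area so the pseudo call opcode can reload it after the indirect branch. A sketch of the usual slot offsets; the numbers here are quoted from the linkage-area layouts and are an assumption for illustration, the authoritative values come from PPCFrameLowering::getTOCSaveOffset().

#include <cstdio>

enum class ABI { ELFv1_64, ELFv2_64, AIX32, AIX64 };

static unsigned tocSaveOffset(ABI A) {
  switch (A) {
  case ABI::ELFv1_64: return 40; // 6-doubleword linkage area, slot 5
  case ABI::ELFv2_64: return 24; // shrunk 4-doubleword linkage area, slot 3
  case ABI::AIX32:    return 20; // 6-word linkage area, slot 5
  case ABI::AIX64:    return 40;
  }
  return 0;
}

int main() {
  // The store above is morally: std r2, tocSaveOffset(r1)
  std::printf("ELFv2 saves r2 at %u(r1)\n", tocSaveOffset(ABI::ELFv2_64));
}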
@@ -6711,7 +6881,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); } else { LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, isTailCall, false, MemOpChains, + isPPC64, CFlags.IsTailCall, false, MemOpChains, TailCallArguments, dl); } ArgOffset += PtrByteSize; @@ -6721,7 +6891,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( if (FPR_idx != NumFPRs) { RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); - if (isVarArg) { + if (CFlags.IsVarArg) { SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); MemOpChains.push_back(Store); @@ -6753,7 +6923,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( } } else LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, isTailCall, false, MemOpChains, + isPPC64, CFlags.IsTailCall, false, MemOpChains, TailCallArguments, dl); if (isPPC64) ArgOffset += 8; @@ -6764,7 +6934,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: - if (isVarArg) { + if (CFlags.IsVarArg) { // These go aligned on the stack, or in the corresponding R registers // when within range. The Darwin PPC ABI doc claims they also go in // V registers; in fact gcc does this only for arguments that are @@ -6810,7 +6980,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( } else if (nAltivecParamsAtEnd==0) { // We are emitting Altivec params in order. LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, isTailCall, true, MemOpChains, + isPPC64, CFlags.IsTailCall, true, MemOpChains, TailCallArguments, dl); ArgOffset += 16; } @@ -6822,7 +6992,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // don't track this here because nobody below needs it. // If there are more Altivec parameters than fit in registers emit // the stores here. - if (!isVarArg && nAltivecParamsAtEnd > NumVRs) { + if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) { unsigned j = 0; // Offset is aligned; skip 1st 12 params which go in V registers. ArgOffset = ((ArgOffset+15)/16)*16; @@ -6836,7 +7006,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( SDValue PtrOff; // We are emitting Altivec params in order. LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, isTailCall, true, MemOpChains, + isPPC64, CFlags.IsTailCall, true, MemOpChains, TailCallArguments, dl); ArgOffset += 16; } @@ -6850,12 +7020,11 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // On Darwin, R12 must contain the address of an indirect callee. This does // not mean the MTCTR instruction must use R12; it's easier to model this as // an extra parameter, so do that. - if (!isTailCall && - !isFunctionGlobalAddress(Callee) && - !isa<ExternalSymbolSDNode>(Callee) && - !isBLACompatibleAddress(Callee, DAG)) + if (CFlags.IsIndirect) { + assert(!CFlags.IsTailCall && "Indirect tail-calls not supported."); RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : PPC::R12), Callee)); + } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
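Reviewer note, ahead of the CC_AIX changes in the next hunk: a by-value aggregate is rounded up to a multiple of the pointer size and handed out one GPR per word until either the bytes or the argument GPRs run out; any remainder becomes a single memory location in the parameter save area. A small worked sketch of that split, under the assumption of 8 argument GPRs (r3-r10):

#include <cstdio>

static void splitByVal(unsigned ByValSize, unsigned PtrByteSize,
                       unsigned FreeGPRs) {
  const unsigned StackSize =
      (ByValSize + PtrByteSize - 1) / PtrByteSize * PtrByteSize; // alignTo
  unsigned Regs = 0, Offset = 0;
  for (; Offset < StackSize && Regs < FreeGPRs; Offset += PtrByteSize)
    ++Regs; // one GPR per register-width word
  std::printf("%u-byte byval: %u GPR(s), %u byte(s) in memory\n",
              ByValSize, Regs, StackSize - Offset);
}

int main() {
  splitByVal(12, 8, 8); // AIX 64-bit: 2 GPRs, nothing in memory
  splitByVal(12, 4, 2); // AIX 32-bit, 2 GPRs left: 2 GPRs + 4 bytes in memory
}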
@@ -6866,37 +7035,37 @@ SDValue PPCTargetLowering::LowerCall_Darwin( InFlag = Chain.getValue(1); } - if (isTailCall) + if (CFlags.IsTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, - /* unused except on PPC64 ELFv1 */ false, DAG, - RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, - NumBytes, Ins, InVals, CS); + return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + Callee, SPDiff, NumBytes, Ins, InVals, CB); } static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { + const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( + State.getMachineFunction().getSubtarget()); + const bool IsPPC64 = Subtarget.isPPC64(); + const Align PtrAlign = IsPPC64 ? Align(8) : Align(4); + const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + + assert((!ValVT.isInteger() || + (ValVT.getSizeInBits() <= RegVT.getSizeInBits())) && + "Integer argument exceeds register size: should have been legalized"); + if (ValVT == MVT::f128) report_fatal_error("f128 is unimplemented on AIX."); - if (ArgFlags.isByVal()) - report_fatal_error("Passing structure by value is unimplemented."); - if (ArgFlags.isNest()) report_fatal_error("Nest arguments are unimplemented."); if (ValVT.isVector() || LocVT.isVector()) report_fatal_error("Vector arguments are unimplemented on AIX."); - const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>( - State.getMachineFunction().getSubtarget()); - const bool IsPPC64 = Subtarget.isPPC64(); - const unsigned PtrByteSize = IsPPC64 ? 8 : 4; - static const MCPhysReg GPR_32[] = {// 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10}; @@ -6904,6 +7073,38 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10}; + if (ArgFlags.isByVal()) { + if (ArgFlags.getNonZeroByValAlign() > PtrAlign) + report_fatal_error("Pass-by-value arguments with alignment greater than " + "register width are not supported."); + + const unsigned ByValSize = ArgFlags.getByValSize(); + + // An empty aggregate parameter takes up no storage and no registers, + // but needs a MemLoc for a stack slot for the formal arguments side. + if (ByValSize == 0) { + State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE, + State.getNextStackOffset(), RegVT, + LocInfo)); + return false; + } + + const unsigned StackSize = alignTo(ByValSize, PtrAlign); + unsigned Offset = State.AllocateStack(StackSize, PtrAlign); + for (const unsigned E = Offset + StackSize; Offset < E; + Offset += PtrAlign.value()) { + if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo)); + else { + State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE, + Offset, MVT::INVALID_SIMPLE_VALUE_TYPE, + LocInfo)); + break; + } + } + return false; + } + // Arguments always reserve parameter save area. switch (ValVT.SimpleTy) { default: @@ -6913,49 +7114,55 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, assert(IsPPC64 && "PPC32 should have split i64 values."); LLVM_FALLTHROUGH; case MVT::i1: - case MVT::i32: - State.AllocateStack(PtrByteSize, PtrByteSize); - if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) { - MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; - // Promote integers if needed. 
- if (ValVT.getSizeInBits() < RegVT.getSizeInBits()) - LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt - : CCValAssign::LocInfo::ZExt; + case MVT::i32: { + const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign); + // AIX integer arguments are always passed in register width. + if (ValVT.getSizeInBits() < RegVT.getSizeInBits()) + LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt + : CCValAssign::LocInfo::ZExt; + if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo)); - } else - report_fatal_error("Handling of placing parameters on the stack is " - "unimplemented!"); - return false; + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo)); + return false; + } case MVT::f32: case MVT::f64: { // Parameter save area (PSA) is reserved even if the float passes in fpr. const unsigned StoreSize = LocVT.getStoreSize(); // Floats are always 4-byte aligned in the PSA on AIX. // This includes f64 in 64-bit mode for ABI compatibility. - State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4); - if (unsigned Reg = State.AllocateReg(FPR)) - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else - report_fatal_error("Handling of placing parameters on the stack is " - "unimplemented!"); - - // AIX requires that GPRs are reserved for float arguments. - // Successfully reserved GPRs are only initialized for vararg calls. - MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; - for (unsigned I = 0; I < StoreSize; I += PtrByteSize) { + const unsigned Offset = + State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4)); + unsigned FReg = State.AllocateReg(FPR); + if (FReg) + State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo)); + + // Reserve and initialize GPRs or initialize the PSA as required. + for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) { if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) { + assert(FReg && "An FPR should be available when a GPR is reserved."); if (State.isVarArg()) { + // Successfully reserved GPRs are only initialized for vararg calls. // Custom handling is required for: // f64 in PPC32 needs to be split into 2 GPRs. // f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR. State.addLoc( CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo)); } - } else if (State.isVarArg()) { - report_fatal_error("Handling of placing parameters on the stack is " - "unimplemented!"); + } else { + // If there are insufficient GPRs, the PSA needs to be initialized. + // Initialization occurs even if an FPR was initialized for + // compatibility with the AIX XL compiler. The full memory for the + // argument will be initialized even if a prior word is saved in GPR. + // A custom memLoc is used when the argument also passes in FPR so + // that the callee handling can skip over it easily. + State.addLoc( + FReg ? 
CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, + LocInfo) + : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + break; } } @@ -7000,6 +7207,64 @@ static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT, return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue); } +static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) { + const unsigned LASize = FL->getLinkageSize(); + + if (PPC::GPRCRegClass.contains(Reg)) { + assert(Reg >= PPC::R3 && Reg <= PPC::R10 && + "Reg must be a valid argument register!"); + return LASize + 4 * (Reg - PPC::R3); + } + + if (PPC::G8RCRegClass.contains(Reg)) { + assert(Reg >= PPC::X3 && Reg <= PPC::X10 && + "Reg must be a valid argument register!"); + return LASize + 8 * (Reg - PPC::X3); + } + + llvm_unreachable("Only general purpose registers expected."); +} + +// AIX ABI Stack Frame Layout: +// +// Low Memory +--------------------------------------------+ +// SP +---> | Back chain | ---+ +// | +--------------------------------------------+ | +// | | Saved Condition Register | | +// | +--------------------------------------------+ | +// | | Saved Linkage Register | | +// | +--------------------------------------------+ | Linkage Area +// | | Reserved for compilers | | +// | +--------------------------------------------+ | +// | | Reserved for binders | | +// | +--------------------------------------------+ | +// | | Saved TOC pointer | ---+ +// | +--------------------------------------------+ +// | | Parameter save area | +// | +--------------------------------------------+ +// | | Alloca space | +// | +--------------------------------------------+ +// | | Local variable space | +// | +--------------------------------------------+ +// | | Float/int conversion temporary | +// | +--------------------------------------------+ +// | | Save area for AltiVec registers | +// | +--------------------------------------------+ +// | | AltiVec alignment padding | +// | +--------------------------------------------+ +// | | Save area for VRSAVE register | +// | +--------------------------------------------+ +// | | Save area for General Purpose registers | +// | +--------------------------------------------+ +// | | Save area for Floating Point registers | +// | +--------------------------------------------+ +// +---- | Back chain | +// High Memory +--------------------------------------------+ +// +// Specifications: +// AIX 7.2 Assembler Language Reference +// Subroutine linkage convention + SDValue PPCTargetLowering::LowerFormalArguments_AIX( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -7009,9 +7274,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( CallConv == CallingConv::Fast) && "Unexpected calling convention!"); - if (isVarArg) - report_fatal_error("This call type is unimplemented on AIX."); - if (getTargetMachine().Options.GuaranteedTailCallOpt) report_fatal_error("Tail call support is unimplemented on AIX."); @@ -7029,67 +7291,214 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + const EVT PtrVT = getPointerTy(MF.getDataLayout()); // Reserve space for the linkage area on the stack. 
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
-  // On AIX a minimum of 8 words is saved to the parameter save area.
-  const unsigned MinParameterSaveArea = 8 * PtrByteSize;
-  CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize);
+  CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));

   CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);

-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    SDValue ArgValue;
-    ISD::ArgFlagsTy Flags = Ins[i].Flags;
-    if (VA.isRegLoc()) {
-      EVT ValVT = VA.getValVT();
-      MVT LocVT = VA.getLocVT();
+  SmallVector<SDValue, 8> MemOps;
+
+  for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
+    CCValAssign &VA = ArgLocs[I++];
+    MVT LocVT = VA.getLocVT();
+    ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
+
+    // For compatibility with the AIX XL compiler, the float args in the
+    // parameter save area are initialized even if the argument is available
+    // in register. The caller is required to initialize both the register
+    // and memory; however, the callee can choose to expect it in either.
+    // The memloc is dismissed here because the argument is retrieved from
+    // the register.
+    if (VA.isMemLoc() && VA.needsCustom())
+      continue;
+
+    if (Flags.isByVal() && VA.isMemLoc()) {
+      const unsigned Size =
+          alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
+                  PtrByteSize);
+      const int FI = MF.getFrameInfo().CreateFixedObject(
+          Size, VA.getLocMemOffset(), /* IsImmutable */ false,
+          /* IsAliased */ true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+
+      continue;
+    }
+
+    if (Flags.isByVal()) {
+      assert(VA.isRegLoc() && "MemLocs should already be handled.");
+
+      const MCPhysReg ArgReg = VA.getLocReg();
+      const PPCFrameLowering *FL = Subtarget.getFrameLowering();
+
+      if (Flags.getNonZeroByValAlign() > PtrByteSize)
+        report_fatal_error("Over-aligned byvals not supported yet.");
+
+      const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
+      const int FI = MF.getFrameInfo().CreateFixedObject(
+          StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
+          /* IsAliased */ true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+
+      // Add live ins for all the RegLocs for the same ByVal.
+      const TargetRegisterClass *RegClass =
+          IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+
+      auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
+                                               unsigned Offset) {
+        const unsigned VReg = MF.addLiveIn(PhysReg, RegClass);
+        // Since the caller's side has left-justified the aggregate in the
+        // register, we can simply store the entire register into the stack
+        // slot.
+        SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+        // The store to the fixed-stack object is needed because accessing a
+        // field of the ByVal will use a gep and load. Ideally we will optimize
+        // to extracting the value from the register directly, and elide the
+        // stores when the argument's address is not taken, but that will need
+        // to be future work.
+        SDValue Store =
+            DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom,
+                         DAG.getObjectPtrOffset(dl, FIN, Offset),
+                         MachinePointerInfo::getFixedStack(MF, FI, Offset));
+
+        MemOps.push_back(Store);
+      };
+
+      unsigned Offset = 0;
+      HandleRegLoc(VA.getLocReg(), Offset);
+      Offset += PtrByteSize;
+      for (; Offset != StackSize && ArgLocs[I].isRegLoc();
+           Offset += PtrByteSize) {
+        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
+               "RegLocs should be for ByVal argument.");
+
+        const CCValAssign RL = ArgLocs[I++];
+        HandleRegLoc(RL.getLocReg(), Offset);
+      }
+
+      if (Offset != StackSize) {
+        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
+               "Expected MemLoc for remaining bytes.");
+        assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
+        // Consume the MemLoc. The InVal has already been emitted, so nothing
+        // more needs to be done.
+        ++I;
+      }
+
+      continue;
+    }
+
+    EVT ValVT = VA.getValVT();
+    if (VA.isRegLoc() && !VA.needsCustom()) {
       MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
       unsigned VReg =
           MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
-      ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
       if (ValVT.isScalarInteger() &&
           (ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
         ArgValue =
             truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
       }
       InVals.push_back(ArgValue);
-    } else {
-      report_fatal_error("Handling of formal arguments on the stack is "
-                         "unimplemented!");
+      continue;
+    }
+    if (VA.isMemLoc()) {
+      const unsigned LocSize = LocVT.getStoreSize();
+      const unsigned ValSize = ValVT.getStoreSize();
+      assert((ValSize <= LocSize) &&
+             "Object size is larger than size of MemLoc");
+      int CurArgOffset = VA.getLocMemOffset();
+      // Objects are right-justified because AIX is big-endian.
+      if (LocSize > ValSize)
+        CurArgOffset += LocSize - ValSize;
+      // Potential tail calls could cause overwriting of argument stack slots.
+      const bool IsImmutable =
+          !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+            (CallConv == CallingConv::Fast));
+      int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      SDValue ArgValue =
+          DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
+      InVals.push_back(ArgValue);
+      continue;
    }
   }

+  // On AIX a minimum of 8 words is saved to the parameter save area.
+  const unsigned MinParameterSaveArea = 8 * PtrByteSize;
   // Area that is at least reserved in the caller of this function.
-  unsigned MinReservedArea = CCInfo.getNextStackOffset();
+  unsigned CallerReservedArea =
+      std::max(CCInfo.getNextStackOffset(), LinkageSize + MinParameterSaveArea);

   // Set the size that is at least reserved in caller of this function. Tail
   // call optimized function's reserved stack space needs to be aligned so
   // that taking the difference between two stack areas will result in an
   // aligned stack.
- MinReservedArea = - EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); + CallerReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - FuncInfo->setMinReservedArea(MinReservedArea); + FuncInfo->setMinReservedArea(CallerReservedArea); + + if (isVarArg) { + FuncInfo->setVarArgsFrameIndex( + MFI.CreateFixedObject(PtrByteSize, CCInfo.getNextStackOffset(), true)); + SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + + static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10}; + + static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10}; + const unsigned NumGPArgRegs = array_lengthof(IsPPC64 ? GPR_64 : GPR_32); + + // The fixed integer arguments of a variadic function are stored to the + // VarArgsFrameIndex on the stack so that they may be loaded by + // dereferencing the result of va_next. + for (unsigned GPRIndex = + (CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize; + GPRIndex < NumGPArgRegs; ++GPRIndex) { + + const unsigned VReg = + IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass) + : MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); + MemOps.push_back(Store); + // Increment the address for the next argument to store. + SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); + FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); + } + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } SDValue PPCTargetLowering::LowerCall_AIX( - SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool isTailCall, bool isPatchPoint, + SDValue Chain, SDValue Callee, CallFlags CFlags, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite CS) const { + const CallBase *CB) const { + // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the + // AIX ABI stack frame layout. - assert((CallConv == CallingConv::C || - CallConv == CallingConv::Cold || - CallConv == CallingConv::Fast) && "Unexpected calling convention!"); + assert((CFlags.CallConv == CallingConv::C || + CFlags.CallConv == CallingConv::Cold || + CFlags.CallConv == CallingConv::Fast) && + "Unexpected calling convention!"); - if (isPatchPoint) + if (CFlags.IsPatchPoint) report_fatal_error("This call type is unimplemented on AIX."); const PPCSubtarget& Subtarget = @@ -7101,7 +7510,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( MachineFunction &MF = DAG.getMachineFunction(); SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs, + *DAG.getContext()); // Reserve space for the linkage save area (LSA) on the stack. // In both PPC32 and PPC64 there are 6 reserved slots in the LSA: @@ -7109,8 +7519,9 @@ SDValue PPCTargetLowering::LowerCall_AIX( // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64. 
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); const bool IsPPC64 = Subtarget.isPPC64(); + const EVT PtrVT = getPointerTy(DAG.getDataLayout()); const unsigned PtrByteSize = IsPPC64 ? 8 : 4; - CCInfo.AllocateStack(LinkageSize, PtrByteSize); + CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize)); CCInfo.AnalyzeCallOperands(Outs, CC_AIX); // The prolog code of the callee may store up to 8 GPR argument registers to @@ -7120,7 +7531,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize; - const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize; + const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize, + CCInfo.getNextStackOffset()); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass. @@ -7128,77 +7540,192 @@ SDValue PPCTargetLowering::LowerCall_AIX( SDValue CallSeqStart = Chain; SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + + // Set up a copy of the stack pointer for loading and storing any + // arguments that may not fit in the registers available for argument + // passing. + const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64) + : DAG.getRegister(PPC::R1, MVT::i32); for (unsigned I = 0, E = ArgLocs.size(); I != E;) { - CCValAssign &VA = ArgLocs[I++]; + const unsigned ValNo = ArgLocs[I].getValNo(); + SDValue Arg = OutVals[ValNo]; + ISD::ArgFlagsTy Flags = Outs[ValNo].Flags; - if (VA.isMemLoc()) - report_fatal_error("Handling of placing parameters on the stack is " - "unimplemented!"); - if (!VA.isRegLoc()) - report_fatal_error( - "Unexpected non-register location for function call argument."); + if (Flags.isByVal()) { + const unsigned ByValSize = Flags.getByValSize(); - SDValue Arg = OutVals[VA.getValNo()]; + // Nothing to do for zero-sized ByVals on the caller side. + if (!ByValSize) { + ++I; + continue; + } - if (!VA.needsCustom()) { - switch (VA.getLocInfo()) { - default: - report_fatal_error("Unexpected argument extension type."); - case CCValAssign::Full: - break; - case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); - break; - case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); - break; + auto GetLoad = [&](EVT VT, unsigned LoadOffset) { + return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, + (LoadOffset != 0) + ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset) + : Arg, + MachinePointerInfo(), VT); + }; + + unsigned LoadOffset = 0; + + // Initialize registers, which are fully occupied by the by-val argument. + while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) { + SDValue Load = GetLoad(PtrVT, LoadOffset); + MemOpChains.push_back(Load.getValue(1)); + LoadOffset += PtrByteSize; + const CCValAssign &ByValVA = ArgLocs[I++]; + assert(ByValVA.getValNo() == ValNo && + "Unexpected location for pass-by-value argument."); + RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load)); } + + if (LoadOffset == ByValSize) + continue; + + // There must be one more loc to handle the remainder. 
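Reviewer note, ahead of the residue loop below: the final, partially filled by-value register is assembled from power-of-2 loads that are shifted so the bytes end up left-justified in the register, as the big-endian AIX ABI expects. A worked standalone sketch of the same packing, simulating a big-endian zero-extending load on a 64-bit register:

#include <cstdint>
#include <cstdio>

static uint64_t packResidue(const uint8_t *Src, unsigned ResidueBytes) {
  uint64_t Reg = 0;
  unsigned Bytes = 0;
  while (Bytes != ResidueBytes) {
    unsigned N = 1; // PowerOf2Floor of the remaining byte count
    while (N * 2 <= ResidueBytes - Bytes)
      N *= 2;
    uint64_t Load = 0; // simulate a big-endian zero-extending N-byte load
    for (unsigned i = 0; i < N; ++i)
      Load = (Load << 8) | Src[Bytes + i];
    Bytes += N;
    Reg |= Load << (64 - Bytes * 8); // left-justify within the register
  }
  return Reg;
}

int main() {
  const uint8_t Tail[7] = {1, 2, 3, 4, 5, 6, 7};
  // A 7-byte tail becomes a 4-, a 2- and a 1-byte load;
  // prints 0102030405060700: the bytes left-justified, low byte zero.
  std::printf("%016llx\n", (unsigned long long)packResidue(Tail, 7));
}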
+      assert(ArgLocs[I].getValNo() == ValNo &&
+             "Expected additional location for by-value argument.");
+
+      if (ArgLocs[I].isMemLoc()) {
+        assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
+        const CCValAssign &ByValVA = ArgLocs[I++];
+        ISD::ArgFlagsTy MemcpyFlags = Flags;
+        // Only memcpy the bytes that don't pass in register.
+        MemcpyFlags.setByValSize(ByValSize - LoadOffset);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(
+            (LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
+                              : Arg,
+            DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()),
+            CallSeqStart, MemcpyFlags, DAG, dl);
+        continue;
+      }
+
+      // Initialize the final register residue.
+      // Any residue that occupies the final by-val arg register must be
+      // left-justified on AIX. Loads must be a power-of-2 size and cannot be
+      // larger than the ByValSize. For example: a 7-byte by-val arg requires
+      // 4-, 2- and 1-byte loads.
+      const unsigned ResidueBytes = ByValSize % PtrByteSize;
+      assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
+             "Unexpected register residue for by-value argument.");
+      SDValue ResidueVal;
+      for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
+        const unsigned N = PowerOf2Floor(ResidueBytes - Bytes);
+        const MVT VT =
+            N == 1 ? MVT::i8
+                   : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
+        SDValue Load = GetLoad(VT, LoadOffset);
+        MemOpChains.push_back(Load.getValue(1));
+        LoadOffset += N;
+        Bytes += N;
+
+        // By-val arguments are passed left-justified in register.
+        // Every load here needs to be shifted, otherwise a full register load
+        // should have been used.
+        assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
+               "Unexpected load emitted during handling of pass-by-value "
+               "argument.");
+        unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
+        EVT ShiftAmountTy =
+            getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
+        SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
+        SDValue ShiftedLoad =
+            DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
+        ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
+                                              ShiftedLoad)
+                                : ShiftedLoad;
+      }
+
+      const CCValAssign &ByValVA = ArgLocs[I++];
+      RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[I++];
+    const MVT LocVT = VA.getLocVT();
+    const MVT ValVT = VA.getValVT();
+
+    switch (VA.getLocInfo()) {
+    default:
+      report_fatal_error("Unexpected argument extension type.");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc() && !VA.needsCustom()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    if (VA.isMemLoc()) {
+      SDValue PtrOff =
+          DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+      MemOpChains.push_back(
+          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
       continue;
     }

     // Custom handling is used for GPR initializations for vararg float
     // arguments.
- assert(isVarArg && VA.getValVT().isFloatingPoint() && - VA.getLocVT().isInteger() && - "Unexpected custom register handling for calling convention."); + assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg && + ValVT.isFloatingPoint() && LocVT.isInteger() && + "Unexpected register handling for calling convention."); SDValue ArgAsInt = - DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg); + DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg); - if (Arg.getValueType().getStoreSize() == VA.getLocVT().getStoreSize()) + if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize()) // f32 in 32-bit GPR // f64 in 64-bit GPR RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt)); - else if (Arg.getValueType().getSizeInBits() < VA.getLocVT().getSizeInBits()) + else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits()) // f32 in 64-bit GPR. RegsToPass.push_back(std::make_pair( - VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, VA.getLocVT()))); + VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT))); else { // f64 in two 32-bit GPRs // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs. - assert(Arg.getValueType() == MVT::f64 && isVarArg && !IsPPC64 && + assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 && "Unexpected custom register for argument!"); CCValAssign &GPR1 = VA; SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt, DAG.getConstant(32, dl, MVT::i8)); RegsToPass.push_back(std::make_pair( GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32))); - assert(I != E && "A second custom GPR is expected!"); - CCValAssign &GPR2 = ArgLocs[I++]; - assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() && - GPR2.needsCustom() && "A second custom GPR is expected!"); - RegsToPass.push_back(std::make_pair( - GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32))); + + if (I != E) { + // If only 1 GPR was available, there will only be one custom GPR and + // the argument will also pass in memory. + CCValAssign &PeekArg = ArgLocs[I]; + if (PeekArg.isRegLoc() && PeekArg.getValNo() == PeekArg.getValNo()) { + assert(PeekArg.needsCustom() && "A second custom GPR is expected."); + CCValAssign &GPR2 = ArgLocs[I++]; + RegsToPass.push_back(std::make_pair( + GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32))); + } + } } } + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + // For indirect calls, we need to save the TOC base to the stack for // restoration after the call. - if (!isTailCall && !isPatchPoint && - !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) { + if (CFlags.IsIndirect) { + assert(!CFlags.IsTailCall && "Indirect tail-calls not supported."); const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister(); const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); const MVT PtrVT = Subtarget.isPPC64() ? 
MVT::i64 : MVT::i32; @@ -7224,10 +7751,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( } const int SPDiff = 0; - return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, - /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, - InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, - InVals, CS); + return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + Callee, SPDiff, NumBytes, Ins, InVals, CB); } bool @@ -7299,25 +7824,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); - const MCPhysReg *I = - TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); - if (I) { - for (; *I; ++I) { - - if (PPC::G8RCRegClass.contains(*I)) - RetOps.push_back(DAG.getRegister(*I, MVT::i64)); - else if (PPC::F8RCRegClass.contains(*I)) - RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); - else if (PPC::CRRCRegClass.contains(*I)) - RetOps.push_back(DAG.getRegister(*I, MVT::i1)); - else if (PPC::VRRCRegClass.contains(*I)) - RetOps.push_back(DAG.getRegister(*I, MVT::Other)); - else - llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - } - } - RetOps[0] = Chain; // Update chain. // Add the flag if we have it. @@ -7419,6 +7925,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); @@ -7431,9 +7938,10 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, DAG.getConstant(0, dl, PtrVT), Size); // Construct a node for the frame pointer save index. SDValue FPSIdx = getFramePointerFrameIndex(DAG); - // Build a DYNALLOC node. SDValue Ops[3] = { Chain, NegSize, FPSIdx }; SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); + if (hasInlineStackProbe(MF)) + return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops); return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); } @@ -7582,15 +8090,6 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { !Op.getOperand(2).getValueType().isFloatingPoint()) return Op; - bool HasNoInfs = DAG.getTarget().Options.NoInfsFPMath; - bool HasNoNaNs = DAG.getTarget().Options.NoNaNsFPMath; - // We might be able to do better than this under some circumstances, but in - // general, fsel-based lowering of select is a finite-math-only optimization. - // For more information, see section F.3 of the 2.06 ISA specification. - // With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the - // presence of infinities. - if (!Subtarget.hasP9Vector() && (!HasNoInfs || !HasNoNaNs)) - return Op; ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); EVT ResVT = Op.getValueType(); @@ -7598,14 +8097,14 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); SDLoc dl(Op); + SDNodeFlags Flags = Op.getNode()->getFlags(); + // We have xsmaxcdp/xsmincdp which are OK to emit even in the + // presence of infinities. if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) { switch (CC) { default: - // Not a min/max but with finite math, we may still be able to use fsel. 
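The fsel guard that the next hunk reworks exists because fsel returns its first data operand when the condition is >= 0.0 and its second otherwise, and a NaN condition always falls into the "otherwise" arm. A standalone sketch (plain C++ standing in for the hardware semantics, not the patch's code) of how that diverges from an IEEE-correct select for a setlt predicate:

#include <cmath>
#include <cstdio>

// fsel r = (c >= 0.0) ? a : b; NaN is not >= 0.0, so NaN picks b.
static double fsel(double C, double A, double B) { return C >= 0.0 ? A : B; }

int main() {
  double T = 1.0, F = 2.0, X = std::nan("");
  // Rewriting  select (x < 0), t, f  as  fsel(x, f, t)  is only sound when
  // x cannot be NaN:
  double Ieee = (X < 0.0) ? T : F; // setlt(NaN, 0) is false -> F
  double Fsel = fsel(X, F, T);     // NaN is not >= 0.0     -> T
  printf("ieee=%g fsel=%g\n", Ieee, Fsel); // 2 vs 1: they disagree on NaN
}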
- if (HasNoInfs && HasNoNaNs) - break; - return Op; + break; case ISD::SETOGT: case ISD::SETGT: return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS); @@ -7615,10 +8114,13 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } } - // TODO: Propagate flags from the select rather than global settings. - SDNodeFlags Flags; - Flags.setNoInfs(true); - Flags.setNoNaNs(true); + // We might be able to do better than this under some circumstances, but in + // general, fsel-based lowering of select is a finite-math-only optimization. + // For more information, see section F.3 of the 2.06 ISA specification. + // With ISA 3.0 + if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) || + (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs())) + return Op; // If the RHS of the comparison is a 0.0, we don't need to do the // subtraction at all. @@ -7738,12 +8240,12 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, // Emit a store to the stack slot. SDValue Chain; - unsigned Alignment = DAG.getEVTAlignment(Tmp.getValueType()); + Align Alignment(DAG.getEVTAlign(Tmp.getValueType())); if (i32Stack) { MachineFunction &MF = DAG.getMachineFunction(); - Alignment = 4; + Alignment = Align(4); MachineMemOperand *MMO = - MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment); + MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment); SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); @@ -7803,7 +8305,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { // FP to INT conversions are legal for f128. - if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128)) + if (Op->getOperand(0).getValueType() == MVT::f128) return Op; // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on @@ -7899,7 +8401,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, RLI.MPI = LD->getPointerInfo(); RLI.IsDereferenceable = LD->isDereferenceable(); RLI.IsInvariant = LD->isInvariant(); - RLI.Alignment = LD->getAlignment(); + RLI.Alignment = LD->getAlign(); RLI.AAInfo = LD->getAAInfo(); RLI.Ranges = LD->getRanges(); @@ -8043,16 +8545,19 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, SDValue ShuffleSrc2 = SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT); SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV); - unsigned ExtendOp = - SignedConv ? (unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST; SDValue Extend; - if (!Subtarget.hasP9Altivec() && SignedConv) { + if (SignedConv) { Arrange = DAG.getBitcast(IntermediateVT, Arrange); + EVT ExtVT = Op.getOperand(0).getValueType(); + if (Subtarget.hasP9Altivec()) + ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(), + IntermediateVT.getVectorNumElements()); + Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange, - DAG.getValueType(Op.getOperand(0).getValueType())); + DAG.getValueType(ExtVT)); } else - Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange); + Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange); return DAG.getNode(Opc, dl, Op.getValueType(), Extend); } @@ -8068,7 +8573,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, return LowerINT_TO_FPVector(Op, DAG, dl); // Conversions to f128 are legal. 
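Many hunks in this patch replace raw unsigned alignments, where 0 conventionally meant "unspecified", with the llvm::Align type, which only admits nonzero powers of two. A minimal model of that invariant (not LLVM's actual class):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative model: the type itself rules out 0 and non-powers-of-two,
// so the old "0 means unknown" convention cannot leak into arithmetic.
struct AlignModel {
  uint64_t Value;
  explicit AlignModel(uint64_t V) : Value(V) {
    assert(V > 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
};

int main() {
  AlignModel A(4); // like MFI.CreateStackObject(4, Align(4), false)
  printf("align=%llu\n", (unsigned long long)A.Value);
  // AlignModel B(0); // would assert: exactly the bug class the change removes
}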
- if (EnableQuadPrecision && (Op.getValueType() == MVT::f128)) + if (Op.getValueType() == MVT::f128) return Op; if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { @@ -8163,8 +8668,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SINT, DAG.getConstant(53, dl, MVT::i32)); Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, Cond, DAG.getConstant(1, dl, MVT::i64)); - Cond = DAG.getSetCC(dl, MVT::i32, - Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); + Cond = DAG.getSetCC( + dl, + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), + Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); } @@ -8205,7 +8712,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, MachineFrameInfo &MFI = MF.getFrameInfo(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); - int FrameIdx = MFI.CreateStackObject(4, 4, false); + int FrameIdx = MFI.CreateStackObject(4, Align(4), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = @@ -8220,7 +8727,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - RLI.Alignment = 4; + RLI.Alignment = Align(4); MachineMemOperand *MMO = MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, @@ -8257,7 +8764,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, bool ReusingLoad; if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, DAG))) { - int FrameIdx = MFI.CreateStackObject(4, 4, false); + int FrameIdx = MFI.CreateStackObject(4, Align(4), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Store = @@ -8272,7 +8779,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, RLI.Chain = Store; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - RLI.Alignment = 4; + RLI.Alignment = Align(4); } MachineMemOperand *MMO = @@ -8289,7 +8796,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); - int FrameIdx = MFI.CreateStackObject(8, 8, false); + int FrameIdx = MFI.CreateStackObject(8, Align(8), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, @@ -8341,22 +8848,20 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, EVT PtrVT = getPointerTy(MF.getDataLayout()); // Save FP Control Word to register - EVT NodeTys[] = { - MVT::f64, // return register - MVT::Glue // unused in this context - }; - SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); + SDValue Chain = Op.getOperand(0); + SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain); + Chain = MFFS.getValue(1); // Save FP register to stack slot - int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false); + int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, - MachinePointerInfo()); + Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo()); // Load FP Control Word from low 32 bits of stack slot. 
SDValue Four = DAG.getConstant(4, dl, PtrVT); SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); - SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); + SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo()); + Chain = CWD.getValue(1); // Transform as necessary SDValue CWD1 = @@ -8373,8 +8878,11 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue RetVal = DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); - return DAG.getNode((VT.getSizeInBits() < 16 ? - ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); + RetVal = + DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), + dl, VT, RetVal); + + return DAG.getMergeValues({RetVal, Chain}, dl); } SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { @@ -8468,19 +8976,21 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { // Vector related lowering. // -/// BuildSplatI - Build a canonical splati of Val with an element size of -/// SplatSize. Cast the result to VT. -static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, - SelectionDAG &DAG, const SDLoc &dl) { +/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an +/// element size of SplatSize. Cast the result to VT. +static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT, + SelectionDAG &DAG, const SDLoc &dl) { static const MVT VTys[] = { // canonical VT to use for each size. MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 }; EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; - // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. - if (Val == -1) + // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize. + if (Val == ((1LU << (SplatSize * 8)) - 1)) { SplatSize = 1; + Val = 0xFF; + } EVT CanonicalVT = VTys[SplatSize-1]; @@ -8591,10 +9101,9 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Op0 = Op->getOperand(0); - if (!EnableQuadPrecision || - (Op.getValueType() != MVT::f128 ) || + if ((Op.getValueType() != MVT::f128) || (Op0.getOpcode() != ISD::BUILD_PAIR) || - (Op0.getOperand(0).getValueType() != MVT::i64) || + (Op0.getOperand(0).getValueType() != MVT::i64) || (Op0.getOperand(1).getValueType() != MVT::i64)) return SDValue(); @@ -8606,7 +9115,8 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) { const SDValue *InputLoad = &Op; if (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); - if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR) + if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || + InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() != ISD::LOAD) return nullptr; @@ -8614,6 +9124,34 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) { return ISD::isNormalLoad(LD) ? InputLoad : nullptr; } +// Convert the argument APFloat to a single precision APFloat if there is no +// loss in information during the conversion to single precision APFloat and the +// resulting number is not a denormal number. Return true if successful. 
+bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) { + APFloat APFloatToConvert = ArgAPFloat; + bool LosesInfo = true; + APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, + &LosesInfo); + bool Success = (!LosesInfo && !APFloatToConvert.isDenormal()); + if (Success) + ArgAPFloat = APFloatToConvert; + return Success; +} + +// Bitcast the argument APInt to a double and convert it to a single precision +// APFloat, bitcast the APFloat to an APInt and assign it to the original +// argument if there is no loss in information during the conversion from +// double to single precision APFloat and the resulting number is not a denormal +// number. Return true if successful. +bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) { + double DpValue = ArgAPInt.bitsToDouble(); + APFloat APFloatDp(DpValue); + bool Success = convertToNonDenormSingle(APFloatDp); + if (Success) + ArgAPInt = APFloatDp.bitcastToAPInt(); + return Success; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. If we CAN select this case, and if it // selects to a single instruction, return Op. Otherwise, if we can codegen @@ -8630,7 +9168,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // then convert it to a floating-point vector and compare it // to a zero vector to get the boolean result. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, 16, false); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -8665,8 +9203,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } Constant *CP = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), - 16 /* alignment */); + SDValue CPIdx = + DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16)); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); @@ -8733,9 +9271,23 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, - HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || - SplatBitSize > 32) { + bool BVNIsConstantSplat = + BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, !Subtarget.isLittleEndian()); + + // If it is a splat of a double, check if we can shrink it to a 32 bit + // non-denormal float which when converted back to double gives us the same + // double. This is to exploit the XXSPLTIDP instruction. 
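A standalone model of the convertToNonDenormSingle test added above, using plain float/double in place of APFloat: the 64-bit splat may be encoded through the 32-bit XXSPLTIDP image only when rounding to single precision is exact and the result is not a denormal.

#include <cfloat>
#include <cmath>
#include <cstdio>

// Illustrative helper, not the patch's code.
static bool shrinkableToSingle(double D, float &Out) {
  float F = (float)D;
  bool Exact = ((double)F == D);
  bool Denorm = (F != 0.0f && std::fabs(F) < FLT_MIN);
  if (!Exact || Denorm)
    return false;
  Out = F;
  return true;
}

int main() {
  float F;
  printf("1.5    -> %d\n", shrinkableToSingle(1.5, F));    // 1: exact
  printf("0.1    -> %d\n", shrinkableToSingle(0.1, F));    // 0: inexact
  printf("1e-310 -> %d\n", shrinkableToSingle(1e-310, F)); // 0: underflows
}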
+ if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() && + (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) && + convertToNonDenormSingle(APSplatBits)) { + SDValue SplatNode = DAG.getNode( + PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64, + DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32)); + return DAG.getBitcast(Op.getValueType(), SplatNode); + } + + if (!BVNIsConstantSplat || SplatBitSize > 32) { const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); // Handle load-and-splat patterns as we have instructions that will do this @@ -8774,8 +9326,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return SDValue(); } - unsigned SplatBits = APSplatBits.getZExtValue(); - unsigned SplatUndef = APSplatUndef.getZExtValue(); + uint64_t SplatBits = APSplatBits.getZExtValue(); + uint64_t SplatUndef = APSplatUndef.getZExtValue(); unsigned SplatSize = SplatBitSize / 8; // First, handle single instruction cases. @@ -8790,17 +9342,30 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return Op; } - // We have XXSPLTIB for constant splats one byte wide - // FIXME: SplatBits is an unsigned int being cast to an int while passing it - // as an argument to BuildSplatiI. Given SplatSize == 1 it is okay here. + // We have XXSPLTIW for constant splats four bytes wide. + // Given vector length is a multiple of 4, 2-byte splats can be replaced + // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to + // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be + // turned into a 4-byte splat of 0xABABABAB. + if (Subtarget.hasPrefixInstrs() && SplatSize == 2) + return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2, + Op.getValueType(), DAG, dl); + + if (Subtarget.hasPrefixInstrs() && SplatSize == 4) + return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG, + dl); + + // We have XXSPLTIB for constant splats one byte wide. if (Subtarget.hasP9Vector() && SplatSize == 1) - return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl); + return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG, + dl); // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> (32-SplatBitSize)); if (SextVal >= -16 && SextVal <= 15) - return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); + return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG, + dl); // Two instruction sequences. @@ -8831,7 +9396,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // for fneg/fabs. if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { // Make -1 and vspltisw -1: - SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); + SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl); // Make the VSLW intrinsic, computing 0x8000_0000. SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, @@ -8859,7 +9424,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // vsplti + shl self. if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { - SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, Intrinsic::ppc_altivec_vslw @@ -8870,7 +9435,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // vsplti + srl self. 
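The XXSPLTIW widening earlier in this hunk rests on a small bit trick: any 2-byte splat over a vector whose length is a multiple of four bytes is also a 4-byte splat of the replicated halfword. Standalone:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t SplatBits = 0xABABu; // 2-byte splat element
  SplatBits |= SplatBits << 16; // -> 0xABABABAB, a 4-byte splat element
  printf("0x%08X\n", SplatBits);
}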
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { - SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, Intrinsic::ppc_altivec_vsrw @@ -8881,7 +9446,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // vsplti + sra self. if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { - SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, Intrinsic::ppc_altivec_vsraw @@ -8893,7 +9458,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // vsplti + rol self. if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { - SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl); static const unsigned IIDs[] = { // Intrinsic to use for each size. Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, Intrinsic::ppc_altivec_vrlw @@ -8904,19 +9469,19 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // t = vsplti c, result = vsldoi t, t, 1 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { - SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); + SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { - SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); + SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { - SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); + SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl); unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } @@ -9215,6 +9780,107 @@ SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } +/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be +/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise +/// return the default SDValue. +SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG) const { + // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles + // to v16i8. Peek through the bitcasts to get the actual operands. + SDValue LHS = peekThroughBitcasts(SVN->getOperand(0)); + SDValue RHS = peekThroughBitcasts(SVN->getOperand(1)); + + auto ShuffleMask = SVN->getMask(); + SDValue VecShuffle(SVN, 0); + SDLoc DL(SVN); + + // Check that we have a four byte shuffle. + if (!isNByteElemShuffleMask(SVN, 4, 1)) + return SDValue(); + + // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx. 
+ if (RHS->getOpcode() != ISD::BUILD_VECTOR) { + std::swap(LHS, RHS); + VecShuffle = DAG.getCommutedVectorShuffle(*SVN); + ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask(); + } + + // Ensure that the RHS is a vector of constants. + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); + if (!BVN) + return SDValue(); + + // Check if RHS is a splat of 4-bytes (or smaller). + APInt APSplatValue, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || + SplatBitSize > 32) + return SDValue(); + + // Check that the shuffle mask matches the semantics of XXSPLTI32DX. + // The instruction splats a constant C into two words of the source vector + // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }. + // Thus we check that the shuffle mask is the equivalent of + // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively. + // Note: the check above of isNByteElemShuffleMask() ensures that the bytes + // within each word are consecutive, so we only need to check the first byte. + SDValue Index; + bool IsLE = Subtarget.isLittleEndian(); + if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) && + (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 && + ShuffleMask[4] > 15 && ShuffleMask[12] > 15)) + Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32); + else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) && + (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 && + ShuffleMask[0] > 15 && ShuffleMask[8] > 15)) + Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32); + else + return SDValue(); + + // If the splat is narrower than 32-bits, we need to get the 32-bit value + // for XXSPLTI32DX. + unsigned SplatVal = APSplatValue.getZExtValue(); + for (; SplatBitSize < 32; SplatBitSize <<= 1) + SplatVal |= (SplatVal << SplatBitSize); + + SDValue SplatNode = DAG.getNode( + PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS), + Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32)); + return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode); +} + +/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8). +/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is +/// a multiple of 8. Otherwise convert it to a scalar rotation(i128) +/// i.e (or (shl x, C1), (srl x, 128-C1)). 
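Before the implementation below, a standalone sketch (rotate amount illustrative) of the shuffle mask the byte-aligned rotate case constructs with std::iota and std::rotate:

#include <algorithm>
#include <cstdio>
#include <numeric>

int main() {
  const unsigned SHLAmt = 24; // rotate-left amount, a multiple of 8 bits
  int Mask[16];
  std::iota(Mask, Mask + 16, 0);                   // 0,1,2,...,15
  std::rotate(Mask, Mask + SHLAmt / 8, Mask + 16); // 3,4,...,15,0,1,2
  for (int M : Mask)
    printf("%d ", M); // the v16i8 shuffle mask built in LowerROTL below
  printf("\n");
}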
+SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
+  assert(Op.getValueType() == MVT::v1i128 &&
+         "Only set v1i128 as custom, other type shouldn't reach here!");
+  SDLoc dl(Op);
+  SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
+  SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
+  unsigned SHLAmt = N1.getConstantOperandVal(0);
+  if (SHLAmt % 8 == 0) {
+    SmallVector<int, 16> Mask(16, 0);
+    std::iota(Mask.begin(), Mask.end(), 0);
+    std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
+    if (SDValue Shuffle =
+            DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
+                                 DAG.getUNDEF(MVT::v16i8), Mask))
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
+  }
+  SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
+  SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
+                              DAG.getConstant(SHLAmt, dl, MVT::i32));
+  SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
+                              DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
+  SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
+}
+
 /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
 /// is a shuffle we can handle in a single instruction, return it. Otherwise,
 /// return the code it can be lowered into. Worst case, it can always be
@@ -9225,6 +9891,18 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+  // Any nodes that were combined in the target-independent combiner prior
+  // to vector legalization will not be sent to the target combine. Try to
+  // combine it here.
+  if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
+    if (!isa<ShuffleVectorSDNode>(NewShuffle))
+      return NewShuffle;
+    Op = NewShuffle;
+    SVOp = cast<ShuffleVectorSDNode>(Op);
+    V1 = Op.getOperand(0);
+    V2 = Op.getOperand(1);
+  }
   EVT VT = Op.getValueType();
   bool isLittleEndian = Subtarget.isLittleEndian();
@@ -9250,6 +9928,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
       Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
     else
       Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+
+    // If we are loading a partial vector, it does not make sense to adjust
+    // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
+    if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ?
32 : 64)) + Offset = 0; SDValue BasePtr = LD->getBasePtr(); if (Offset != 0) BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), @@ -9288,6 +9971,12 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + if (Subtarget.hasPrefixInstrs()) { + SDValue SplatInsertNode; + if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG))) + return SplatInsertNode; + } + if (Subtarget.hasP9Altivec()) { SDValue NewISDNode; if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) @@ -9523,7 +10212,13 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, MVT::i32)); } + ShufflesHandledWithVPERM++; SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); + LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n"); + LLVM_DEBUG(SVOp->dump()); + LLVM_DEBUG(dbgs() << "With the following permute control vector:\n"); + LLVM_DEBUG(VPermMask.dump()); + if (isLittleEndian) return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V2, V1, VPermMask); @@ -9880,18 +10575,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(); } -SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { - // Check for a DIV with the same operands as this REM. - for (auto UI : Op.getOperand(1)->uses()) { - if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) || - (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV)) - if (UI->getOperand(0) == Op.getOperand(0) && - UI->getOperand(1) == Op.getOperand(1)) - return SDValue(); - } - return Op; -} - // Lower scalar BSWAP64 to xxbrd. SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -9950,7 +10633,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. 
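A standalone sketch of the load-and-splat base-pointer adjustment earlier in this hunk: the byte offset of the splatted element is endian-dependent, and, as the new check above notes, it must be forced to 0 when the load produces only a partial vector.

#include <cstdio>

int main() {
  const bool IsLittleEndian = true;
  // 4-byte elements (v4i32/v4f32); 8-byte elements use (1 - Idx) * 8 instead.
  for (unsigned SplatIdx = 0; SplatIdx < 4; ++SplatIdx) {
    unsigned Offset = IsLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
    printf("elt %u -> byte offset %u\n", SplatIdx, Offset);
  }
}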
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, 16, false); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -10020,7 +10703,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, Value); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, 16, false); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -10161,9 +10844,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue Stores[4]; for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Ex = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, - DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, + DAG.getVectorIdxConstant(Idx, dl)); SDValue Store; if (ScalarVT != ScalarMemVT) Store = @@ -10220,7 +10902,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, Value); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, 16, false); + int FrameIdx = MFI.CreateStackObject(16, Align(16), false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -10269,9 +10951,9 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::v4i32) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); - SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); - SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. - + SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl); + // +16 as shift amt. + SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl); SDValue RHSSwap = // = vrlw RHS, 16 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); @@ -10291,13 +10973,6 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG, dl); return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); - } else if (Op.getValueType() == MVT::v8i16) { - SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); - - SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); - - return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, - LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); bool isLittleEndian = Subtarget.isLittleEndian(); @@ -10504,6 +11179,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::ROTL: return LowerROTL(Op, DAG); // For counter-based loop handling. 
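The ReplaceNodeResults change above now returns the two halves of READ_TIME_BASE as a single i64 BUILD_PAIR rather than as two i32 results. A scalar model of the pairing (which 32-bit result is the low half is a target detail; the order shown here is illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Lo = 0xDDCCBBAA, Hi = 0x00000012;
  uint64_t Pair = ((uint64_t)Hi << 32) | Lo; // two i32 halves as one i64
  printf("0x%016llX\n", (unsigned long long)Pair);
}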
case ISD::INTRINSIC_W_CHAIN: return SDValue(); @@ -10516,9 +11192,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); - case ISD::SREM: - case ISD::UREM: - return LowerREM(Op, DAG); case ISD::BSWAP: return LowerBSWAP(Op, DAG); case ISD::ATOMIC_CMP_SWAP: @@ -10537,8 +11210,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); - Results.push_back(RTB); - Results.push_back(RTB.getValue(1)); + Results.push_back( + DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1))); Results.push_back(RTB.getValue(2)); break; } @@ -11198,13 +11871,192 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, return MBB; } +bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + return false; +} + +unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // Round down to the stack alignment. + StackProbeSize &= ~(StackAlign - 1); + return StackProbeSize ? StackProbeSize : StackAlign; +} + +// Lower dynamic stack allocation with probing. `emitProbedAlloca` is splitted +// into three phases. In the first phase, it uses pseudo instruction +// PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and +// FinalStackPtr. In the second phase, it generates a loop for probing blocks. +// At last, it uses pseudo instruction DYNAREAOFFSET to get the future result of +// MaxCallFrameSize so that it can calculate correct data area pointer. +MachineBasicBlock * +PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, + MachineBasicBlock *MBB) const { + const bool isPPC64 = Subtarget.isPPC64(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + const unsigned ProbeSize = getStackProbeSize(*MF); + const BasicBlock *ProbedBB = MBB->getBasicBlock(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + // The CFG of probing stack looks as + // +-----+ + // | MBB | + // +--+--+ + // | + // +----v----+ + // +--->+ TestMBB +---+ + // | +----+----+ | + // | | | + // | +-----v----+ | + // +---+ BlockMBB | | + // +----------+ | + // | + // +---------+ | + // | TailMBB +<--+ + // +---------+ + // In MBB, calculate previous frame pointer and final stack pointer. + // In TestMBB, test if sp is equal to final stack pointer, if so, jump to + // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB. + // TailMBB is spliced via \p MI. 
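A standalone model (not the emitted machine code) of the probing scheme the CFG comment above describes, assuming the default 4096-byte probe size: touch the leading residual first, then step toward the final stack pointer one probe-sized block at a time so every guard page is hit in order.

#include <cstdio>

int main() {
  const long ProbeSize = 4096; // default when no stack-probe-size attribute
  const long AllocSize = 10000;
  long SP = 0;                 // stack offsets, growing downward
  const long FinalSP = SP - AllocSize;

  long Residual = AllocSize % ProbeSize;
  SP -= Residual;            // stdux-style update: move the SP and touch it
  printf("probe at %ld\n", SP);
  while (SP != FinalSP) {    // TestMBB: sp == final ? exit : keep probing
    SP -= ProbeSize;         // BlockMBB: atomic update plus touch
    printf("probe at %ld\n", SP);
  }
}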
+ MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB); + MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB); + MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB); + + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + MF->insert(MBBIter, TestMBB); + MF->insert(MBBIter, BlockMBB); + MF->insert(MBBIter, TailMBB); + + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + Register DstReg = MI.getOperand(0).getReg(); + Register NegSizeReg = MI.getOperand(1).getReg(); + Register SPReg = isPPC64 ? PPC::X1 : PPC::R1; + Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + + // Get the canonical FinalStackPtr like what + // PPCRegisterInfo::lowerDynamicAlloc does. + BuildMI(*MBB, {MI}, DL, + TII->get(isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 + : PPC::PREPARE_PROBED_ALLOCA_32), + FramePointer) + .addDef(FinalStackPtr) + .addReg(NegSizeReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + + // Materialize a scratch register for update. + int64_t NegProbeSize = -(int64_t)ProbeSize; + assert(isInt<32>(NegProbeSize) && "Unhandled probe size!"); + Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + if (!isInt<16>(NegProbeSize)) { + Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg) + .addImm(NegProbeSize >> 16); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI), + ScratchReg) + .addReg(TempReg) + .addImm(NegProbeSize & 0xFFFF); + } else + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg) + .addImm(NegProbeSize); + + { + // Probing leading residual part. + Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div) + .addReg(NegSizeReg) + .addReg(ScratchReg); + Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul) + .addReg(Div) + .addReg(ScratchReg); + Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod) + .addReg(Mul) + .addReg(NegSizeReg); + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg) + .addReg(FramePointer) + .addReg(SPReg) + .addReg(NegMod); + } + + { + // Remaining part should be multiple of ProbeSize. + Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass); + BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult) + .addReg(SPReg) + .addReg(FinalStackPtr); + BuildMI(TestMBB, DL, TII->get(PPC::BCC)) + .addImm(PPC::PRED_EQ) + .addReg(CmpResult) + .addMBB(TailMBB); + TestMBB->addSuccessor(BlockMBB); + TestMBB->addSuccessor(TailMBB); + } + + { + // Touch the block. + // |P...|P...|P... + BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg) + .addReg(FramePointer) + .addReg(SPReg) + .addReg(ScratchReg); + BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB); + BlockMBB->addSuccessor(TestMBB); + } + + // Calculation of MaxCallFrameSize is deferred to prologepilog, use + // DYNAREAOFFSET pseudo instruction to get the future result. + Register MaxCallFrameSizeReg = + MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + BuildMI(TailMBB, DL, + TII->get(isPPC64 ? 
PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET), + MaxCallFrameSizeReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); + BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg) + .addReg(SPReg) + .addReg(MaxCallFrameSizeReg); + + // Splice instructions after MI to TailMBB. + TailMBB->splice(TailMBB->end(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + TailMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(TestMBB); + + // Delete the pseudo instruction. + MI.eraseFromParent(); + + ++NumDynamicAllocaProbed; + return TailMBB; +} + MachineBasicBlock * PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { if (MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { if (Subtarget.is64BitELFABI() && - MI.getOpcode() == TargetOpcode::PATCHPOINT) { + MI.getOpcode() == TargetOpcode::PATCHPOINT && + !Subtarget.isUsingPCRelativeCalls()) { // Call lowering should have added an r2 operand to indicate a dependence // on the TOC base pointer value. It can't however, because there is no // way to mark the dependence as implicit there, and so the stackmap code @@ -11886,12 +12738,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } MachineFrameInfo &MFI = F->getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(8, 8, false); + int FrameIdx = MFI.CreateStackObject(8, Align(8), false); MachineMemOperand *MMOStore = F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*F, FrameIdx, 0), - MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachinePointerInfo::getFixedStack(*F, FrameIdx, 0), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlign(FrameIdx)); // Store the SrcReg into the stack. BuildMI(*BB, MI, dl, TII->get(StoreOp)) @@ -11901,9 +12753,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addMemOperand(MMOStore); MachineMemOperand *MMOLoad = F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*F, FrameIdx, 0), - MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachinePointerInfo::getFixedStack(*F, FrameIdx, 0), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlign(FrameIdx)); // Load from the stack where SrcReg is stored, and save to DestReg, // so we have done the RegClass conversion from RegClass::SrcReg to @@ -11963,6 +12815,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewFPSCRReg) .addImm(0) .addImm(0); + } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 || + MI.getOpcode() == PPC::PROBED_ALLOCA_64) { + return emitProbedAlloca(MI, BB); } else { llvm_unreachable("Unexpected instr type to insert"); } @@ -13167,15 +14022,20 @@ static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, DAG.getVectorShuffle(Input.getValueType(), dl, Input, DAG.getUNDEF(Input.getValueType()), ShuffleMask); - EVT Ty = N->getValueType(0); - SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); - return BV; + EVT VT = N->getValueType(0); + SDValue Conv = DAG.getBitcast(VT, Shuffle); + + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), + Input.getValueType().getVectorElementType(), + VT.getVectorNumElements()); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv, + DAG.getValueType(ExtVT)); } // Look for build vector patterns where input operands come from sign // extended vector_extract elements of specific indices. 
If the correct indices -// aren't used, add a vector shuffle to fix up the indices and create a new -// PPCISD:SExtVElems node which selects the vector sign extend instructions +// aren't used, add a vector shuffle to fix up the indices and create +// SIGN_EXTEND_INREG node which selects the vector sign extend instructions // during instruction selection. static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { // This array encodes the indices that the vector sign extend instructions @@ -13498,8 +14358,8 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is // aligned and the type is a vector with elements up to 4 bytes - if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) - && VecTy.getScalarSizeInBits() <= 32 ) { + if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) && + VecTy.getScalarSizeInBits() <= 32) { return SDValue(); } @@ -13569,8 +14429,8 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is // aligned and the type is a vector with elements up to 4 bytes - if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) - && VecTy.getScalarSizeInBits() <= 32 ) { + if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) && + VecTy.getScalarSizeInBits() <= 32) { return SDValue(); } @@ -13650,6 +14510,210 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, return Val; } +static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) { + // Check that the source of the element keeps flipping + // (i.e. Mask[i] < NumElts -> Mask[i+i] >= NumElts). + bool PrevElemFromFirstVec = Mask[0] < NumElts; + for (int i = 1, e = Mask.size(); i < e; i++) { + if (PrevElemFromFirstVec && Mask[i] < NumElts) + return false; + if (!PrevElemFromFirstVec && Mask[i] >= NumElts) + return false; + PrevElemFromFirstVec = !PrevElemFromFirstVec; + } + return true; +} + +static bool isSplatBV(SDValue Op) { + if (Op.getOpcode() != ISD::BUILD_VECTOR) + return false; + SDValue FirstOp; + + // Find first non-undef input. + for (int i = 0, e = Op.getNumOperands(); i < e; i++) { + FirstOp = Op.getOperand(i); + if (!FirstOp.isUndef()) + break; + } + + // All inputs are undef or the same as the first non-undef input. + for (int i = 1, e = Op.getNumOperands(); i < e; i++) + if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef()) + return false; + return true; +} + +static SDValue isScalarToVec(SDValue Op) { + if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR) + return Op; + if (Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR) + return Op; + return SDValue(); +} + +static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV, + int LHSMaxIdx, int RHSMinIdx, + int RHSMaxIdx, int HalfVec) { + for (int i = 0, e = ShuffV.size(); i < e; i++) { + int Idx = ShuffV[i]; + if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx)) + ShuffV[i] += HalfVec; + } + return; +} + +// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if +// the original is: +// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C)))) +// In such a case, just change the shuffle mask to extract the element +// from the permuted index. 
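A standalone copy of the isAlternatingShuffMask test defined earlier in this hunk, run on two sample masks, a merge-friendly one and a rejected one:

#include <cstdio>

// Same logic as isAlternatingShuffMask: the source vector must flip on
// every element (indices < NumElts come from the first input, >= NumElts
// from the second).
static bool isAlternating(const int *Mask, int Size, int NumElts) {
  bool PrevFromFirst = Mask[0] < NumElts;
  for (int i = 1; i < Size; ++i) {
    if (PrevFromFirst == (Mask[i] < NumElts))
      return false; // two consecutive elements from the same input
    PrevFromFirst = !PrevFromFirst;
  }
  return true;
}

int main() {
  int Merge[8] = {0, 8, 1, 9, 2, 10, 3, 11}; // alternates: merge candidate
  int Other[8] = {0, 1, 8, 9, 2, 3, 10, 11}; // does not alternate
  printf("%d %d\n", isAlternating(Merge, 8, 8), isAlternating(Other, 8, 8));
}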
+static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) { + SDLoc dl(OrigSToV); + EVT VT = OrigSToV.getValueType(); + assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR && + "Expecting a SCALAR_TO_VECTOR here"); + SDValue Input = OrigSToV.getOperand(0); + + if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1)); + SDValue OrigVector = Input.getOperand(0); + + // Can't handle non-const element indices or different vector types + // for the input to the extract and the output of the scalar_to_vector. + if (Idx && VT == OrigVector.getValueType()) { + SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1); + NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue(); + return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask); + } + } + return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT, + OrigSToV.getOperand(0)); +} + +// On little endian subtargets, combine shuffles such as: +// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b +// into: +// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b +// because the latter can be matched to a single instruction merge. +// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute +// to put the value into element zero. Adjust the shuffle mask so that the +// vector can remain in permuted form (to prevent a swap prior to a shuffle). +SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG) const { + SDValue LHS = SVN->getOperand(0); + SDValue RHS = SVN->getOperand(1); + auto Mask = SVN->getMask(); + int NumElts = LHS.getValueType().getVectorNumElements(); + SDValue Res(SVN, 0); + SDLoc dl(SVN); + + // None of these combines are useful on big endian systems since the ISA + // already has a big endian bias. + if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX()) + return Res; + + // If this is not a shuffle of a shuffle and the first element comes from + // the second vector, canonicalize to the commuted form. This will make it + // more likely to match one of the single instruction patterns. + if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE && + RHS.getOpcode() != ISD::VECTOR_SHUFFLE) { + std::swap(LHS, RHS); + Res = DAG.getCommutedVectorShuffle(*SVN); + Mask = cast<ShuffleVectorSDNode>(Res)->getMask(); + } + + // Adjust the shuffle mask if either input vector comes from a + // SCALAR_TO_VECTOR and keep the respective input vector in permuted + // form (to prevent the need for a swap). + SmallVector<int, 16> ShuffV(Mask.begin(), Mask.end()); + SDValue SToVLHS = isScalarToVec(LHS); + SDValue SToVRHS = isScalarToVec(RHS); + if (SToVLHS || SToVRHS) { + int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements() + : SToVRHS.getValueType().getVectorNumElements(); + int NumEltsOut = ShuffV.size(); + + // Initially assume that neither input is permuted. These will be adjusted + // accordingly if either input is. + int LHSMaxIdx = -1; + int RHSMinIdx = -1; + int RHSMaxIdx = -1; + int HalfVec = LHS.getValueType().getVectorNumElements() / 2; + + // Get the permuted scalar to vector nodes for the source(s) that come from + // ISD::SCALAR_TO_VECTOR. + if (SToVLHS) { + // Set up the values for the shuffle vector fixup. 
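The values set up just below feed fixupShuffleMaskForPermutedSToV. A standalone sketch of the resulting mask rewrite for a v4i32 shuffle whose inputs were both scalar_to_vector (all indices illustrative): once an input is permuted, its scalar sits in the middle element rather than element zero, so references to element zero of either input are bumped by HalfVec.

#include <cstdio>

int main() {
  const int HalfVec = 2; // v4i32: 4 / 2
  const int LHSMaxIdx = 1, RHSMinIdx = 4, RHSMaxIdx = 5;
  int ShuffV[4] = {0, 4, 1, 5}; // refers to element 0 of both inputs
  for (int &Idx : ShuffV)
    if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
      Idx += HalfVec; // now points at the permuted scalar's position
  for (int Idx : ShuffV)
    printf("%d ", Idx); // 2 6 1 5
  printf("\n");
}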
+ LHSMaxIdx = NumEltsOut / NumEltsIn; + SToVLHS = getSToVPermuted(SToVLHS, DAG); + if (SToVLHS.getValueType() != LHS.getValueType()) + SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS); + LHS = SToVLHS; + } + if (SToVRHS) { + RHSMinIdx = NumEltsOut; + RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx; + SToVRHS = getSToVPermuted(SToVRHS, DAG); + if (SToVRHS.getValueType() != RHS.getValueType()) + SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS); + RHS = SToVRHS; + } + + // Fix up the shuffle mask to reflect where the desired element actually is. + // The minimum and maximum indices that correspond to element zero for both + // the LHS and RHS are computed and will control which shuffle mask entries + // are to be changed. For example, if the RHS is permuted, any shuffle mask + // entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by + // HalfVec to refer to the corresponding element in the permuted vector. + fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx, + HalfVec); + Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV); + + // We may have simplified away the shuffle. We won't be able to do anything + // further with it here. + if (!isa<ShuffleVectorSDNode>(Res)) + return Res; + Mask = cast<ShuffleVectorSDNode>(Res)->getMask(); + } + + // The common case after we commuted the shuffle is that the RHS is a splat + // and we have elements coming in from the splat at indices that are not + // conducive to using a merge. + // Example: + // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero> + if (!isSplatBV(RHS)) + return Res; + + // We are looking for a mask such that all even elements are from + // one vector and all odd elements from the other. + if (!isAlternatingShuffMask(Mask, NumElts)) + return Res; + + // Adjust the mask so we are pulling in the same index from the splat + // as the index from the interesting vector in consecutive elements. + // Example (even elements from first vector): + // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero> + if (Mask[0] < NumElts) + for (int i = 1, e = Mask.size(); i < e; i += 2) + ShuffV[i] = (ShuffV[i - 1] + NumElts); + // Example (odd elements from first vector): + // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero> + else + for (int i = 0, e = Mask.size(); i < e; i += 2) + ShuffV[i] = (ShuffV[i + 1] + NumElts); + + // If the RHS has undefs, we need to remove them since we may have created + // a shuffle that adds those instead of the splat value. + SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue(); + RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal); + + Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV); + return Res; +} + SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase, DAGCombinerInfo &DCI) const { @@ -13721,6 +14785,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return combineSRL(N, DCI); case ISD::MUL: return combineMUL(N, DCI); + case ISD::FMA: + case PPCISD::FNMSUB: + return combineFMALike(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. 
return N->getOperand(0); @@ -13756,7 +14823,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0)); return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI); } - break; + return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG); case ISD::STORE: { EVT Op1VT = N->getOperand(1).getValueType(); @@ -13963,17 +15030,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); + Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty); Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); - unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); + Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. - !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || - VT == MVT::v4i32 || VT == MVT::v4f32)) || + !Subtarget.hasP8Vector() && + (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v4f32)) || (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && - LD->getAlignment() >= ScalarABIAlignment)) && - LD->getAlignment() < ABIAlignment) { + LD->getAlign() >= ScalarABIAlignment)) && + LD->getAlign() < ABIAlignment) { // This is a type-legal unaligned Altivec or QPX load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); @@ -14520,6 +15588,7 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: + case PPC::DIR_PWR10: case PPC::DIR_PWR_FUTURE: { if (!ML) break; @@ -14926,18 +15995,16 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { bool isPPC64 = Subtarget.isPPC64(); - bool IsDarwinABI = Subtarget.isDarwinABI(); bool is64Bit = isPPC64 && VT == LLT::scalar(64); if (!is64Bit && VT != LLT::scalar(32)) report_fatal_error("Invalid register global variable type"); Register Reg = StringSwitch<Register>(RegName) - .Case("r1", is64Bit ? PPC::X1 : PPC::R1) - .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2) - .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() : - (is64Bit ? PPC::X13 : PPC::R13)) - .Default(Register()); + .Case("r1", is64Bit ? PPC::X1 : PPC::R1) + .Case("r2", isPPC64 ? Register() : PPC::R2) + .Case("r13", (is64Bit ? 
+                     .Default(Register());
 
   if (Reg)
     return Reg;
@@ -15030,7 +16097,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = -VT.getStoreSize()+1;
     Info.size = 2*VT.getStoreSize()-1;
-    Info.align = Align::None();
+    Info.align = Align(1);
     Info.flags = MachineMemOperand::MOLoad;
     return true;
   }
@@ -15064,7 +16131,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.size = VT.getStoreSize();
-    Info.align = Align::None();
+    Info.align = Align(1);
     Info.flags = MachineMemOperand::MOLoad;
     return true;
   }
@@ -15116,7 +16183,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = I.getArgOperand(1);
     Info.offset = -VT.getStoreSize()+1;
     Info.size = 2*VT.getStoreSize()-1;
-    Info.align = Align::None();
+    Info.align = Align(1);
     Info.flags = MachineMemOperand::MOStore;
     return true;
   }
@@ -15149,7 +16216,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = I.getArgOperand(1);
     Info.offset = 0;
     Info.size = VT.getStoreSize();
-    Info.align = Align::None();
+    Info.align = Align(1);
     Info.flags = MachineMemOperand::MOStore;
     return true;
   }
@@ -15160,35 +16227,24 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   return false;
 }
 
-/// getOptimalMemOpType - Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. If DstAlign is zero that means it's safe to destination
-/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If 'IsMemset' is
-/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-/// source is constant so it does not need to be loaded.
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
 EVT PPCTargetLowering::getOptimalMemOpType(
-    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
-    bool ZeroMemset, bool MemcpyStrSrc,
-    const AttributeList &FuncAttributes) const {
+    const MemOp &Op, const AttributeList &FuncAttributes) const {
   if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
     // When expanding a memset, require at least two QPX instructions to cover
     // the cost of loading the value to be stored from the constant pool.
-    if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
-        (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+    if (Subtarget.hasQPX() && Op.size() >= 32 &&
+        (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) &&
         !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
       return MVT::v4f64;
     }
 
    // We should use Altivec/VSX loads and stores when available. For unaligned
    // addresses, unaligned VSX loads are only fast starting with the P8.
-    if (Subtarget.hasAltivec() && Size >= 16 &&
-        (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
-         ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+    if (Subtarget.hasAltivec() && Op.size() >= 16 &&
+        (Op.isAligned(Align(16)) ||
+         ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
      return MVT::v4i32;
  }
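The selection logic above reduces to a small predicate. As a rough standalone model (illustrative names, not LLVM's MemOp API; the QPX branch is omitted):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for the subtarget/MemOp queries used above.
    struct MemOpModel {
      uint64_t Size;
      bool Aligned16; // both source and destination are 16-byte aligned
      bool IsMemset;
    };

    const char *optimalMemOpType(const MemOpModel &Op, bool HasAltivec,
                                 bool HasVSX, bool HasP8Vector) {
      // Mirrors the hunk: use a 16-byte vector type when the op is large
      // enough and either aligned, or the subtarget is fast on unaligned
      // vector accesses (memset with VSX, or any access on P8 and later).
      if (HasAltivec && Op.Size >= 16 &&
          (Op.Aligned16 || (Op.IsMemset && HasVSX) || HasP8Vector))
        return "v4i32";
      return "other"; // fall back to generic selection
    }

    int main() {
      MemOpModel Op{32, /*Aligned16=*/false, /*IsMemset=*/false};
      // Unaligned memcpy: profitable only with P8's fast unaligned VSX loads.
      printf("%s\n", optimalMemOpType(Op, true, true, /*HasP8Vector=*/true));
      return 0;
    }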
@@ -15304,22 +16360,48 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
 
 bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                    EVT VT) const {
-  VT = VT.getScalarType();
-
-  if (!VT.isSimple())
-    return false;
+  return isFMAFasterThanFMulAndFAdd(
+      MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
+}
 
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::f32:
-  case MVT::f64:
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
+                                                   Type *Ty) const {
+  switch (Ty->getScalarType()->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
     return true;
-  case MVT::f128:
-    return (EnableQuadPrecision && Subtarget.hasP9Vector());
+  case Type::FP128TyID:
+    return Subtarget.hasP9Vector();
   default:
-    break;
+    return false;
   }
+}
 
-  return false;
+// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist.
+// FIXME: add more patterns which are profitable to hoist.
+bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
+  if (I->getOpcode() != Instruction::FMul)
+    return true;
+
+  if (!I->hasOneUse())
+    return true;
+
+  Instruction *User = I->user_back();
+  assert(User && "A single use instruction with no uses.");
+
+  if (User->getOpcode() != Instruction::FSub &&
+      User->getOpcode() != Instruction::FAdd)
+    return true;
+
+  const TargetOptions &Options = getTargetMachine().Options;
+  const Function *F = I->getFunction();
+  const DataLayout &DL = F->getParent()->getDataLayout();
+  Type *Ty = User->getOperand(0)->getType();
+
+  return !(
+      isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+      isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
 }
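The intent of the predicate is that an fmul feeding a single fadd/fsub should stay next to that user so later combining can fuse the pair into one FMA. A simplified standalone model of the decision (hypothetical types, not LLVM's Instruction API):

    #include <string>

    struct InstModel {
      std::string Opcode;     // opcode of the candidate instruction
      int NumUses;            // number of uses of its result
      std::string UserOpcode; // opcode of the single user, if any
    };

    // Mirrors isProfitableToHoist: hoisting is fine unless it would split an
    // fmul from the one fadd/fsub that could absorb it as an FMA.
    bool profitableToHoist(const InstModel &I, bool FMAFastAndLegal,
                           bool FusionAllowed) {
      if (I.Opcode != "fmul" || I.NumUses != 1)
        return true;
      if (I.UserOpcode != "fadd" && I.UserOpcode != "fsub")
        return true;
      return !(FMAFastAndLegal && FusionAllowed);
    }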
 
 const MCPhysReg *
@@ -15335,12 +16417,12 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
   return ScratchRegs;
 }
 
-unsigned PPCTargetLowering::getExceptionPointerRegister(
+Register PPCTargetLowering::getExceptionPointerRegister(
     const Constant *PersonalityFn) const {
   return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
 }
 
-unsigned PPCTargetLowering::getExceptionSelectorRegister(
+Register PPCTargetLowering::getExceptionSelectorRegister(
     const Constant *PersonalityFn) const {
   return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
 }
@@ -15371,58 +16453,83 @@ PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
   return PPC::createFastISel(FuncInfo, LibInfo);
 }
 
-void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
-  if (Subtarget.isDarwinABI()) return;
-  if (!Subtarget.isPPC64()) return;
-
-  // Update IsSplitCSR in PPCFunctionInfo
-  PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
-  PFI->setIsSplitCSR(true);
+// 'Inverted' means the FMA opcode after negating one multiplicand.
+// For example, (fma -a b c) = (fnmsub a b c)
+static unsigned invertFMAOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Invalid FMA opcode for PowerPC!");
+  case ISD::FMA:
+    return PPCISD::FNMSUB;
+  case PPCISD::FNMSUB:
+    return ISD::FMA;
+  }
 }
 
-void PPCTargetLowering::insertCopiesSplitCSR(
-  MachineBasicBlock *Entry,
-  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
-  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
-  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
-  if (!IStart)
-    return;
+SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+                                                bool LegalOps, bool OptForSize,
+                                                NegatibleCost &Cost,
+                                                unsigned Depth) const {
+  if (Depth > SelectionDAG::MaxRecursionDepth)
+    return SDValue();
 
-  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
-  MachineBasicBlock::iterator MBBI = Entry->begin();
-  for (const MCPhysReg *I = IStart; *I; ++I) {
-    const TargetRegisterClass *RC = nullptr;
-    if (PPC::G8RCRegClass.contains(*I))
-      RC = &PPC::G8RCRegClass;
-    else if (PPC::F8RCRegClass.contains(*I))
-      RC = &PPC::F8RCRegClass;
-    else if (PPC::CRRCRegClass.contains(*I))
-      RC = &PPC::CRRCRegClass;
-    else if (PPC::VRRCRegClass.contains(*I))
-      RC = &PPC::VRRCRegClass;
-    else
-      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  SDNodeFlags Flags = Op.getNode()->getFlags();
+
+  switch (Opc) {
+  case PPCISD::FNMSUB:
+    // TODO: QPX subtarget is deprecated. No transformation here.
+    if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
+      break;
 
-    Register NewVR = MRI->createVirtualRegister(RC);
-    // Create copy from CSR to a virtual register.
-    // FIXME: this currently does not emit CFI pseudo-instructions, it works
-    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
-    // nounwind. If we want to generalize this later, we may need to emit
-    // CFI pseudo-instructions.
-    assert(Entry->getParent()->getFunction().hasFnAttribute(
-               Attribute::NoUnwind) &&
-           "Function should be nounwind in insertCopiesSplitCSR!");
-    Entry->addLiveIn(*I);
-    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
-        .addReg(*I);
+    const TargetOptions &Options = getTargetMachine().Options;
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    SDValue N2 = Op.getOperand(2);
+    SDLoc Loc(Op);
 
-    // Insert the copy-back instructions right before the terminator.
-    for (auto *Exit : Exits)
-      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
-              TII->get(TargetOpcode::COPY), *I)
-          .addReg(NewVR);
+    NegatibleCost N2Cost = NegatibleCost::Expensive;
+    SDValue NegN2 =
+        getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);
+
+    if (!NegN2)
+      return SDValue();
+
+    // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
+    // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
+    // These transformations may change sign of zeroes. For example,
+    // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
+    if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
+      // Try and choose the cheaper one to negate.
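The sign-of-zero caveat in the comments above is ordinary IEEE-754 arithmetic and can be reproduced in a few lines of plain C++ (a standalone illustration of the algebra, not of the DAG nodes; compile without fast-math):

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = 1.0, b = 1.0, c = 1.0;
      // fneg of (fnmsub a b c): -(-(a*b - c)) gives a*b - c = +0.0 here.
      double keepC = -(-(a * b - c));
      // Negating a multiplicand instead: -((-a)*b - (-c)) gives -0.0.
      double negA = -((-a) * b - (-c));
      printf("%d %d\n", (int)std::signbit(keepC), (int)std::signbit(negA));
      // Prints "0 1": the two rewrites disagree on the sign of zero, hence
      // the no-signed-zeros gate on the multiplicand-negating forms.
      return 0;
    }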
+      NegatibleCost N0Cost = NegatibleCost::Expensive;
+      SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,
+                                           N0Cost, Depth + 1);
+
+      NegatibleCost N1Cost = NegatibleCost::Expensive;
+      SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,
+                                           N1Cost, Depth + 1);
+
+      if (NegN0 && N0Cost <= N1Cost) {
+        Cost = std::min(N0Cost, N2Cost);
+        return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
+      } else if (NegN1) {
+        Cost = std::min(N1Cost, N2Cost);
+        return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
+      }
+    }
+
+    // (fneg (fnmsub a b c)) => (fma a b (fneg c))
+    if (isOperationLegal(ISD::FMA, VT)) {
+      Cost = N2Cost;
+      return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
+    }
+
+    break;
   }
+
+  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
+                                              Cost, Depth);
 }
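 
 // Override to enable LOAD_STACK_GUARD lowering on Linux.
@@ -15450,6 +16557,13 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
     return false;
   case MVT::f32:
   case MVT::f64:
+    if (Subtarget.hasPrefixInstrs()) {
+      // With prefixed instructions, we can materialize anything that can be
+      // represented with a 32-bit immediate, not just positive zero.
+      APFloat APFloatOfImm = Imm;
+      return convertToNonDenormSingle(APFloatOfImm);
+    }
+    LLVM_FALLTHROUGH;
   case MVT::ppcf128:
     return Imm.isPosZero();
   }
@@ -15620,10 +16734,59 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Transform
+// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
+// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
+// In this case both C1 and C2 must be known constants.
+// C1+C2 must fit into a 34 bit signed integer.
+static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
+                                          const PPCSubtarget &Subtarget) {
+  if (!Subtarget.isUsingPCRelativeCalls())
+    return SDValue();
+
+  // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
+  // If we find that node try to cast the Global Address and the Constant.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
+    std::swap(LHS, RHS);
+
+  if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
+    return SDValue();
+
+  // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
+  GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
+  ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(RHS);
+
+  // Check that both casts succeeded.
+  if (!GSDN || !ConstNode)
+    return SDValue();
+
+  int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
+  SDLoc DL(GSDN);
+
+  // The signed int offset needs to fit in 34 bits.
+  if (!isInt<34>(NewOffset))
+    return SDValue();
+
+  // The new global address is a copy of the old global address except
+  // that it has the updated Offset.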
+  SDValue GA =
+      DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
+                                 NewOffset, GSDN->getTargetFlags());
+  SDValue MatPCRel =
+      DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
+  return MatPCRel;
+}
+
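The 34-bit bound above comes from the signed displacement that the PC-relative addressing form can encode, so the fold only fires when the combined offset still fits. A standalone equivalent of that guard (fitsSigned is an illustrative stand-in for llvm::isInt<34>, not the LLVM helper itself):

    #include <cstdint>
    #include <cstdio>

    // True iff x fits in an N-bit signed integer.
    template <unsigned N> bool fitsSigned(int64_t x) {
      return x >= -(INT64_C(1) << (N - 1)) && x < (INT64_C(1) << (N - 1));
    }

    int main() {
      int64_t C1 = 0x1FFFFFFFF, C2 = 1;        // 2^33 - 1, plus 1
      printf("%d\n", fitsSigned<34>(C1));      // 1: 2^33 - 1 is the maximum
      printf("%d\n", fitsSigned<34>(C1 + C2)); // 0: 2^33 overflows the field
      return 0;
    }

 SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
     return Value;
 
+  if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
+    return Value;
+
   return SDValue();
 }
 
@@ -15648,6 +16811,24 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
   SDLoc dl(N);
   SDValue Op0 = N->getOperand(0);
+  // fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b)
+  if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) {
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+      return SDValue();
+    SDValue Sub = Op0.getOperand(0);
+    if (Sub.getOpcode() == ISD::SUB) {
+      SDValue SubOp0 = Sub.getOperand(0);
+      SDValue SubOp1 = Sub.getOperand(1);
+      if ((SubOp0.getOpcode() == ISD::ZERO_EXTEND) &&
+          (SubOp1.getOpcode() == ISD::ZERO_EXTEND)) {
+        return DCI.DAG.getNode(PPCISD::VABSD, dl, VT, SubOp0.getOperand(0),
+                               SubOp1.getOperand(0),
+                               DCI.DAG.getTargetConstant(0, dl, MVT::i32));
+      }
+    }
+  }
+
   // Looking for a truncate of i128 to i64.
   if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
     return SDValue();
@@ -15702,6 +16883,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
     //  vector        7       2      2
     return true;
   case PPC::DIR_PWR9:
+  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE:
     //  type        mul     add    shl
     //  scalar        5       2      2
@@ -15763,6 +16945,44 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
   }
 }
 
+// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
+// in combiner since we need to check SD flags and other subtarget features.
+SDValue PPCTargetLowering::combineFMALike(SDNode *N,
+                                          DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDNodeFlags Flags = N->getFlags();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetOptions &Options = getTargetMachine().Options;
+  unsigned Opc = N->getOpcode();
+  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool LegalOps = !DCI.isBeforeLegalizeOps();
+  SDLoc Loc(N);
+
+  // TODO: QPX subtarget is deprecated. No transformation here.
+  if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT))
+    return SDValue();
+
+  // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
+  // since (fnmsub a b c)=-0 while c-ab=+0.
+  if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
+    return SDValue();
+
+  // (fma (fneg a) b c) => (fnmsub a b c)
+  // (fnmsub (fneg a) b c) => (fma a b c)
+  if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
+    return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
+
+  // (fma a (fneg b) c) => (fnmsub a b c)
+  // (fnmsub a (fneg b) c) => (fma a b c)
+  if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
+    return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
+
+  return SDValue();
+}
+
 bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
   if (!Subtarget.is64BitELFABI())

For reference, the vabsd fold in combineTRUNCATE above implements, per unsigned lane, the absolute-difference computation sketched here (a scalar model of one v16i8 lane; the real node operates on whole vectors):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of one lane of (truncate (abs (sub (zext a), (zext b)))):
    // widen, subtract, take the absolute value, truncate back.
    uint8_t absd(uint8_t a, uint8_t b) {
      int32_t wide = int32_t(a) - int32_t(b); // zext to a wider signed type
      return uint8_t(wide < 0 ? -wide : wide);
    }

    int main() {
      printf("%u\n", absd(10, 250)); // 240, same as max(a,b) - min(a,b)
      return 0;
    }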