path: root/lib/Target
author      Dimitry Andric <dim@FreeBSD.org>   2015-09-06 18:34:38 +0000
committer   Dimitry Andric <dim@FreeBSD.org>   2015-09-06 18:34:38 +0000
commit      69156b4c20249e7800cc09e0eef0beb3d15ac1ad (patch)
tree        461d3cf041290f4a99740d540bf0973d6084f98e /lib/Target
parent      ee8648bdac07986a0f1ec897b02ec82a2f144d46 (diff)
download    src-69156b4c20249e7800cc09e0eef0beb3d15ac1ad.tar.gz
            src-69156b4c20249e7800cc09e0eef0beb3d15ac1ad.zip
Import llvm 3.7.0 release (r246257).  [tag: vendor/llvm/llvm-release_370-r246257]
Notes:
    svn path=/vendor/llvm/dist/; revision=287510
    svn path=/vendor/llvm/llvm-release_370-r246257/; revision=287511; tag=vendor/llvm/llvm-release_370-r246257
Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp     |   1
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp          |   1
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td                          |   5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp             |  19
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp                |   2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h                  |   5
-rw-r--r--  lib/Target/AMDGPU/AMDILCFGStructurizer.cpp           |  16
-rw-r--r--  lib/Target/AMDGPU/Processors.td                      |   4
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp                 |  59
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h                   |   1
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td                     |   2
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td                  |   7
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp          |   1
-rw-r--r--  lib/Target/AMDGPU/SIPrepareScratchRegs.cpp           |   1
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp                 |   5
-rw-r--r--  lib/Target/AMDGPU/VIInstructions.td                  |  42
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp                   |   6
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp             | 366
-rw-r--r--  lib/Target/ARM/README.txt                            |   2
-rw-r--r--  lib/Target/ARM/Thumb1InstrInfo.cpp                   |   2
-rw-r--r--  lib/Target/Hexagon/HexagonFrameLowering.cpp          |   4
-rw-r--r--  lib/Target/Mips/Mips64InstrInfo.td                   |   8
-rw-r--r--  lib/Target/Mips/MipsFastISel.cpp                     |  37
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp                 |   8
-rw-r--r--  lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp  |  25
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp                 | 128
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp              |   3
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp               |   7
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp               |  20
-rw-r--r--  lib/Target/README.txt                                |   2
-rw-r--r--  lib/Target/Sparc/SparcFrameLowering.cpp              |  19
-rw-r--r--  lib/Target/SystemZ/SystemZCallingConv.td             |   4
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.cpp           |  14
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.h             |   4
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp            |  56
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp                  |   3
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp                  |   2
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp                   |  82
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp                      |  15
-rw-r--r--  lib/Target/X86/X86InstrSSE.td                        |   8
40 files changed, 582 insertions, 414 deletions
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 79a84ad8c6c5..9d6dbd641a16 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -593,6 +593,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
if (Change) {
Substs[MO.getReg()] = Reg;
MO.setReg(Reg);
+ MRI->setPhysRegUsed(Reg);
Changed = true;
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index a7817f4f67dd..a76473f7e539 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -354,6 +354,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes && NeedsRealignment) {
// Use the first callee-saved register as a scratch register.
scratchSPReg = AArch64::X9;
+ MF.getRegInfo().setPhysRegUsed(scratchSPReg);
}
// If we're a leaf function, try using the red zone.
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index ef8ef6268548..68b50504ee44 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -123,6 +123,11 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"true",
"VI SGPR initilization bug requiring a fixed SGPR allocation size">;
+def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
+ "EnableHugeScratchBuffer",
+ "true",
+ "Enable scratch buffer sizes greater than 128 GB">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 37b77d778d9f..64c54ccb31ff 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1029,6 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &SLC, SDValue &TFE) const {
SDValue Ptr, Offen, Idxen, Addr64;
+ // addr64 bit was removed for volcanic islands.
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return false;
+
SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
GLC, SLC, TFE);
@@ -1095,13 +1099,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
- ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-
- if (isLegalMUBUFImmOffset(C1)) {
- VAddr = Addr.getOperand(0);
- ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
+ // Offsets in vaddr must be positive.
+ if (CurDAG->SignBitIsZero(N0)) {
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ if (isLegalMUBUFImmOffset(C1)) {
+ VAddr = N0;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
+ }
}
}
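The new SignBitIsZero(N0) guard works together with the LowerFrameIndex change later in this patch: frame-index-derived bases are wrapped in an AssertZext that marks their sign bit as known zero (unless huge-scratch-buffer is enabled), so the common scratch-access case still folds the constant offset into the immediate, while bases that might be negative are left alone.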
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index bd5abc4f546e..5f32a65c9338 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -73,7 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
- IsaVersion(ISAVersion0_0_0),
+ IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16, // Maximum stack alignment (long16)
0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 90831bfb4458..735f01dfa7c5 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -89,6 +89,7 @@ private:
bool FeatureDisable;
int LDSBankCount;
unsigned IsaVersion;
+ bool EnableHugeScratchBuffer;
AMDGPUFrameLowering FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@@ -271,6 +272,10 @@ public:
return DevName;
}
+ bool enableHugeScratchBuffer() const {
+ return EnableHugeScratchBuffer;
+ }
+
bool dumpCode() const {
return DumpCode;
}
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index c9b25a1a0b84..d918ac3a5b3b 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1719,7 +1719,6 @@ MachineBasicBlock *
AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
- const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
if (!LoopHeader || !LoopLatch)
return nullptr;
@@ -1732,18 +1731,9 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
- MachineBasicBlock::iterator I = BranchMI;
- unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
- llvm_unreachable("Extra register needed to handle CFG");
- MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
- MachineInstrBuilder MIB(*FuncRep, NewMI);
- MIB.addMBB(LoopHeader);
- MIB.addReg(ImmReg, false);
- SHOWNEWINSTR(NewMI);
- BranchMI->eraseFromParent();
- LoopLatch->addSuccessor(DummyExitBlk);
-
- return DummyExitBlk;
+ LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
+ Ctx.emitError("Extra register needed to handle CFG");
+ return nullptr;
}
void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index 69efb8b8bc43..d9a0723bedc9 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -138,3 +138,7 @@ def : ProcessorModel<"iceland", SIQuarterSpeedModel,
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
>;
+
+def : ProcessorModel<"fiji", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index dd818a9ba746..099b0b15942b 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,6 +254,12 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
return false;
}
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
@@ -263,8 +269,21 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Assume that we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
+ // fall-through
case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -324,11 +343,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
- case AMDGPUAS::FLAT_ADDRESS: {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
- }
+ case AMDGPUAS::FLAT_ADDRESS:
+ return isLegalFlatAddressingMode(AM);
+
default:
llvm_unreachable("unhandled address space");
}
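As a standalone sketch of what the new helper accepts (field names follow TargetLowering::AddrMode; this is an illustration, not the in-tree code):

    // Flat instructions address memory through a single 64-bit register, with
    // no immediate-offset field and no scaled index.
    struct AddrMode { int64_t BaseOffs; int64_t Scale; };

    static bool isLegalFlatAddr(const AddrMode &AM) {
      return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
    }
    // isLegalFlatAddr({/*BaseOffs=*/0, /*Scale=*/0}) == true   (plain register)
    // isLegalFlatAddr({/*BaseOffs=*/0, /*Scale=*/1}) == true   (unscaled index)
    // isLegalFlatAddr({/*BaseOffs=*/8, /*Scale=*/0}) == false  (offset needs a separate add)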
@@ -812,10 +829,29 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If
+ // the high bit of a frame index offset were to be set, this would mean
+ // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+ // scratch buffer, with 64 being the number of threads per wave.
+ //
+ // If we know the machine uses less than 128GB of scratch, then we can
+ // mark the high bit of the FrameIndex node as known zero,
+ // which is important, because it means in most situations we can
+ // prove that values derived from FrameIndex nodes are non-negative.
+ // This enables us to take advantage of more addressing modes when
+ // accessing scratch buffers, since for scratch reads/writes, the register
+ // offset must always be positive.
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ if (Subtarget->enableHugeScratchBuffer())
+ return TFI;
+
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
}
/// This transforms the control flow intrinsics to get the branch destination as
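Using the names from the hunk above, a minimal sketch of what the AssertZext wrapper buys (illustration only; the 128 GB bound in the comment is simply 2 GiB of per-lane offset times the 64 lanes of a wavefront):

    // The frame index is an opaque 32-bit value; wrapping it in AssertZext
    // with a 31-bit assertion type records that bit 31 is zero.
    SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
    EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), 31);
    SDValue Known = DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
                                DAG.getValueType(NarrowVT));
    // SelectionDAG::SignBitIsZero(Known) can now return true, which is what
    // the SelectMUBUFScratch change earlier in this patch checks before
    // folding a constant offset into vaddr.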
@@ -2034,6 +2070,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
+static bool isFrameIndexOp(SDValue Op) {
+ if (Op.getOpcode() == ISD::AssertZext)
+ Op = Op.getOperand(0);
+
+ return isa<FrameIndexSDNode>(Op);
+}
+
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
@@ -2042,7 +2085,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
- if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+ if (!isFrameIndexOp(Node->getOperand(i))) {
Ops.push_back(Node->getOperand(i));
continue;
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 635b4edc89de..d84c32ec0092 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -56,6 +56,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ bool isLegalFlatAddressingMode(const AddrMode &AM) const;
public:
SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index b39a78714640..8d8110bca4c5 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1600,12 +1600,14 @@ multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
SIMCInstr <opName#"_e32", SISubtarget.SI> {
let Defs = !if(DefExec, [EXEC], []);
let hasSideEffects = DefExec;
+ let AssemblerPredicates = [isSICI];
}
def _vi : VOPC<op.VI, ins, asm, []>,
SIMCInstr <opName#"_e32", SISubtarget.VI> {
let Defs = !if(DefExec, [EXEC], []);
let hasSideEffects = DefExec;
+ let AssemblerPredicates = [isVI];
}
}
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 1ee63c675822..f78ffd72314c 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -2910,9 +2910,6 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
} // End Predicates = [isSICI]
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
@@ -3273,13 +3270,13 @@ def : Pat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
(V_CNDMASK_B64_PSEUDO
- $x,
(V_MIN_F64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
SRCMODS.NONE,
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
+ $x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
>;
@@ -3291,13 +3288,13 @@ def : Pat <
$x,
SRCMODS.NEG,
(V_CNDMASK_B64_PSEUDO
- $x,
(V_MIN_F64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
SRCMODS.NONE,
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
+ $x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d23b92edef33..587ea63d6796 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,6 +53,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
+ MRI.setPhysRegUsed(LaneVGPR);
// Add this register as live-in to all blocks to avoid machine verifer
// complaining about use of an undefined physical register.
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index b086d2ed6652..0a7f684552f0 100644
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -91,6 +91,7 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
if (ScratchOffsetReg != AMDGPU::NoRegister) {
// Found an SGPR to use
+ MRI.setPhysRegUsed(ScratchOffsetReg);
BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
.addReg(ScratchOffsetPreloadReg);
} else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ce4acafac9fa..54c4d549fac7 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -348,7 +348,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::SReg_128RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
- &AMDGPU::VReg_512RegClass
+ &AMDGPU::VReg_512RegClass,
+ &AMDGPU::SReg_512RegClass
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -499,7 +500,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
- if (MRI.reg_nodbg_empty(*I))
+ if (!MRI.isPhysRegUsed(*I))
return *I;
}
return AMDGPU::NoRegister;
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index 5bf86e649ce0..aca46732adb9 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -103,4 +103,46 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
>;
+// Patterns for global loads with no offset
+class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+
+class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (node vt:$data, i64:$addr),
+ (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr, vt:$data)),
+ (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+
} // End Predicates = [isVI]
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e335784f6d87..8cc06df71633 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4583,6 +4583,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDLoc dl(Op);
+ if (CmpVT.getVectorElementType() == MVT::i64)
+ // 64-bit comparisons are not legal. We've marked SETCC as non-Custom,
+ // but it's possible that our operands are 64-bit but our result is 32-bit.
+ // Bail in this case.
+ return SDValue();
+
if (Op1.getValueType().isFloatingPoint()) {
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal FP comparison");
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 37352810c99f..265b86f75f1d 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -118,7 +118,6 @@ namespace {
};
SpecificBumpPtrAllocator<MergeCandidate> Allocator;
SmallVector<const MergeCandidate*,4> Candidates;
- SmallVector<MachineInstr*,4> MergeBaseCandidates;
void moveLiveRegsBefore(const MachineBasicBlock &MBB,
MachineBasicBlock::const_iterator Before);
@@ -141,7 +140,6 @@ namespace {
MachineBasicBlock::iterator &MBBI);
bool MergeBaseUpdateLoadStore(MachineInstr *MI);
bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
- bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
};
@@ -933,6 +931,11 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
CanMergeToLSMulti = false;
+ // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
+ // deprecated; LDM to PC is fine but cannot happen here.
+ if (PReg == ARM::SP || PReg == ARM::PC)
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+
// Merge following instructions where possible.
for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
int NewOffset = MemOps[I].Offset;
@@ -940,16 +943,15 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
break;
const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
unsigned Reg = MO.getReg();
- unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+ if (Reg == ARM::SP || Reg == ARM::PC)
+ break;
// See if the current load/store may be part of a multi load/store.
+ unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
bool PartOfLSMulti = CanMergeToLSMulti;
if (PartOfLSMulti) {
- // Cannot load from SP
- if (Reg == ARM::SP)
- PartOfLSMulti = false;
// Register numbers must be in ascending order.
- else if (RegNum <= PRegNum)
+ if (RegNum <= PRegNum)
PartOfLSMulti = false;
// For VFP / NEON load/store multiples, the registers must be
// consecutive and within the limit on the number of registers per
@@ -993,6 +995,76 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
} while (SIndex < EIndex);
}
+static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+ unsigned Bytes, unsigned Limit,
+ ARMCC::CondCodes Pred, unsigned PredReg) {
+ unsigned MyPredReg = 0;
+ if (!MI)
+ return false;
+
+ bool CheckCPSRDef = false;
+ switch (MI->getOpcode()) {
+ default: return false;
+ case ARM::tSUBi8:
+ case ARM::t2SUBri:
+ case ARM::SUBri:
+ CheckCPSRDef = true;
+ break;
+ case ARM::tSUBspi:
+ break;
+ }
+
+ // Make sure the offset fits in 8 bits.
+ if (Bytes == 0 || (Limit && Bytes >= Limit))
+ return false;
+
+ unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
+ MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
+ if (!(MI->getOperand(0).getReg() == Base &&
+ MI->getOperand(1).getReg() == Base &&
+ (MI->getOperand(2).getImm() * Scale) == Bytes &&
+ getInstrPredicate(MI, MyPredReg) == Pred &&
+ MyPredReg == PredReg))
+ return false;
+
+ return CheckCPSRDef ? !definesCPSR(MI) : true;
+}
+
+static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+ unsigned Bytes, unsigned Limit,
+ ARMCC::CondCodes Pred, unsigned PredReg) {
+ unsigned MyPredReg = 0;
+ if (!MI)
+ return false;
+
+ bool CheckCPSRDef = false;
+ switch (MI->getOpcode()) {
+ default: return false;
+ case ARM::tADDi8:
+ case ARM::t2ADDri:
+ case ARM::ADDri:
+ CheckCPSRDef = true;
+ break;
+ case ARM::tADDspi:
+ break;
+ }
+
+ if (Bytes == 0 || (Limit && Bytes >= Limit))
+ // Make sure the offset fits in 8 bits.
+ return false;
+
+ unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
+ MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
+ if (!(MI->getOperand(0).getReg() == Base &&
+ MI->getOperand(1).getReg() == Base &&
+ (MI->getOperand(2).getImm() * Scale) == Bytes &&
+ getInstrPredicate(MI, MyPredReg) == Pred &&
+ MyPredReg == PredReg))
+ return false;
+
+ return CheckCPSRDef ? !definesCPSR(MI) : true;
+}
+
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
ARM_AM::AMSubMode Mode) {
switch (Opc) {
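For a concrete picture of the folds these helpers enable in MergeBaseUpdateLSMultiple below (illustrative assembly, not taken from the patch): an "ldmia r0, {r1-r3}" preceded by a matching "sub r0, r0, #12" (12 bytes for three 4-byte registers) can be rewritten as the single writeback form "ldmdb r0!, {r1-r3}", and a trailing "add r0, r0, #12" after the same ldmia folds into "ldmia r0!, {r1-r3}".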
@@ -1060,75 +1132,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
}
}
-/// Check if the given instruction increments or decrements a register and
-/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
-/// generated by the instruction are possibly read as well.
-static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg) {
- bool CheckCPSRDef;
- int Scale;
- switch (MI.getOpcode()) {
- case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
- case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
- case ARM::t2SUBri:
- case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
- case ARM::t2ADDri:
- case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
- case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
- case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
- default: return 0;
- }
-
- unsigned MIPredReg;
- if (MI.getOperand(0).getReg() != Reg ||
- MI.getOperand(1).getReg() != Reg ||
- getInstrPredicate(&MI, MIPredReg) != Pred ||
- MIPredReg != PredReg)
- return 0;
-
- if (CheckCPSRDef && definesCPSR(&MI))
- return 0;
- return MI.getOperand(2).getImm() * Scale;
-}
-
-/// Searches for an increment or decrement of \p Reg before \p MBBI.
-static MachineBasicBlock::iterator
-findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
- Offset = 0;
- MachineBasicBlock &MBB = *MBBI->getParent();
- MachineBasicBlock::iterator BeginMBBI = MBB.begin();
- MachineBasicBlock::iterator EndMBBI = MBB.end();
- if (MBBI == BeginMBBI)
- return EndMBBI;
-
- // Skip debug values.
- MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
- --PrevMBBI;
-
- Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
- return Offset == 0 ? EndMBBI : PrevMBBI;
-}
-
-/// Searches for a increment or decrement of \p Reg after \p MBBI.
-static MachineBasicBlock::iterator
-findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
- Offset = 0;
- MachineBasicBlock &MBB = *MBBI->getParent();
- MachineBasicBlock::iterator EndMBBI = MBB.end();
- MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- // Skip debug values.
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
- ++NextMBBI;
- if (NextMBBI == EndMBBI)
- return EndMBBI;
-
- Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
- return Offset == 0 ? EndMBBI : NextMBBI;
-}
-
/// Fold proceeding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
@@ -1148,6 +1151,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
const MachineOperand &BaseOP = MI->getOperand(0);
unsigned Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
+ unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned Opcode = MI->getOpcode();
@@ -1159,24 +1163,49 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
if (MI->getOperand(i).getReg() == Base)
return false;
- int Bytes = getLSMultipleTransferSize(MI);
+ bool DoMerge = false;
+ ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
+
+ // Try merging with the previous instruction.
MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator MBBI(MI);
- int Offset;
- MachineBasicBlock::iterator MergeInstr
- = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
- ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
- if (Mode == ARM_AM::ia && Offset == -Bytes) {
- Mode = ARM_AM::db;
- } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
- Mode = ARM_AM::da;
- } else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
- if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
- ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
- return false;
+ if (MBBI != BeginMBBI) {
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+ while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
+ --PrevMBBI;
+ if (Mode == ARM_AM::ia &&
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+ Mode = ARM_AM::db;
+ DoMerge = true;
+ } else if (Mode == ARM_AM::ib &&
+ isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+ Mode = ARM_AM::da;
+ DoMerge = true;
+ }
+ if (DoMerge)
+ MBB.erase(PrevMBBI);
}
- MBB.erase(MergeInstr);
+
+ // Try merging with the next instruction.
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ if (!DoMerge && MBBI != EndMBBI) {
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ ++NextMBBI;
+ if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+ isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+ DoMerge = true;
+ } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+ isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+ DoMerge = true;
+ }
+ if (DoMerge)
+ MBB.erase(NextMBBI);
+ }
+
+ if (!DoMerge)
+ return false;
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
@@ -1254,6 +1283,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
unsigned Base = getLoadStoreBaseOp(*MI).getReg();
bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
+ unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
@@ -1265,6 +1295,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
return false;
+ bool isLd = isLoadSingle(Opcode);
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (MI->getOperand(0).getReg() == Base)
@@ -1272,31 +1303,55 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
- int Bytes = getLSMultipleTransferSize(MI);
+ bool DoMerge = false;
+ ARM_AM::AddrOpc AddSub = ARM_AM::add;
+ unsigned NewOpc = 0;
+ // AM2 - 12 bits, thumb2 - 8 bits.
+ unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
+
+ // Try merging with the previous instruction.
MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator MBBI(MI);
- int Offset;
- MachineBasicBlock::iterator MergeInstr
- = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
- unsigned NewOpc;
- if (!isAM5 && Offset == Bytes) {
- NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
- } else if (Offset == -Bytes) {
- NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
- } else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
- if (Offset == Bytes) {
- NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
- } else if (!isAM5 && Offset == -Bytes) {
- NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
- } else
- return false;
+ if (MBBI != BeginMBBI) {
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+ while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
+ --PrevMBBI;
+ if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
+ DoMerge = true;
+ AddSub = ARM_AM::sub;
+ } else if (!isAM5 &&
+ isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
+ DoMerge = true;
+ }
+ if (DoMerge) {
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
+ MBB.erase(PrevMBBI);
+ }
}
- MBB.erase(MergeInstr);
- ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Try merging with the next instruction.
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ if (!DoMerge && MBBI != EndMBBI) {
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ ++NextMBBI;
+ if (!isAM5 &&
+ isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
+ DoMerge = true;
+ AddSub = ARM_AM::sub;
+ } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
+ DoMerge = true;
+ }
+ if (DoMerge) {
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
+ MBB.erase(NextMBBI);
+ }
+ }
+
+ if (!DoMerge)
+ return false;
- bool isLd = isLoadSingle(Opcode);
if (isAM5) {
// VLDM[SD]_UPD, VSTM[SD]_UPD
// (There are no base-updating versions of VLDR/VSTR instructions, but the
@@ -1313,16 +1368,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM2) {
// LDR_PRE, LDR_POST
if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+ int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
} else {
- int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
}
} else {
+ int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2LDR_PRE, t2LDR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
@@ -1334,12 +1391,13 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// the vestigal zero-reg offset register. When that's fixed, this clause
// can be removed entirely.
if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
- int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
} else {
+ int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
@@ -1351,66 +1409,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
return true;
}
-bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
- unsigned Opcode = MI.getOpcode();
- assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
- "Must have t2STRDi8 or t2LDRDi8");
- if (MI.getOperand(3).getImm() != 0)
- return false;
-
- // Behaviour for writeback is undefined if base register is the same as one
- // of the others.
- const MachineOperand &BaseOp = MI.getOperand(2);
- unsigned Base = BaseOp.getReg();
- const MachineOperand &Reg0Op = MI.getOperand(0);
- const MachineOperand &Reg1Op = MI.getOperand(1);
- if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
- return false;
-
- unsigned PredReg;
- ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
- MachineBasicBlock::iterator MBBI(MI);
- MachineBasicBlock &MBB = *MI.getParent();
- int Offset;
- MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
- PredReg, Offset);
- unsigned NewOpc;
- if (Offset == 8 || Offset == -8) {
- NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
- } else {
- MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
- if (Offset == 8 || Offset == -8) {
- NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
- } else
- return false;
- }
- MBB.erase(MergeInstr);
-
- DebugLoc DL = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
- if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
- MIB.addOperand(Reg0Op).addOperand(Reg1Op)
- .addReg(BaseOp.getReg(), RegState::Define);
- } else {
- assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
- MIB.addReg(BaseOp.getReg(), RegState::Define)
- .addOperand(Reg0Op).addOperand(Reg1Op);
- }
- MIB.addReg(BaseOp.getReg(), RegState::Kill)
- .addImm(Offset).addImm(Pred).addReg(PredReg);
- assert(TII->get(Opcode).getNumOperands() == 6 &&
- TII->get(NewOpc).getNumOperands() == 7 &&
- "Unexpected number of operands in Opcode specification.");
-
- // Transfer implicit operands.
- for (const MachineOperand &MO : MI.implicit_operands())
- MIB.addOperand(MO);
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- MBB.erase(MBBI);
- return true;
-}
-
/// Returns true if instruction is a memory operation that this pass is capable
/// of operating on.
static bool isMemoryOp(const MachineInstr *MI) {
@@ -1618,7 +1616,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
ARMCC::CondCodes CurrPred = ARMCC::AL;
unsigned Position = 0;
assert(Candidates.size() == 0);
- assert(MergeBaseCandidates.size() == 0);
LiveRegsValid = false;
for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
@@ -1697,15 +1694,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI = I;
--Position;
// Fallthrough to look into existing chain.
- } else if (MBBI->isDebugValue()) {
+ } else if (MBBI->isDebugValue())
continue;
- } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
- MBBI->getOpcode() == ARM::t2STRDi8) {
- // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
- // remember them because we may still be able to merge add/sub into them.
- MergeBaseCandidates.push_back(MBBI);
- }
-
// If we are here then the chain is broken; Extract candidates for a merge.
if (MemOps.size() > 0) {
@@ -1736,9 +1726,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (Merged) {
Changed = true;
unsigned Opcode = Merged->getOpcode();
- if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
- MergeBaseUpdateLSDouble(*Merged);
- else
+ if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8)
MergeBaseUpdateLSMultiple(Merged);
} else {
for (MachineInstr *MI : Candidate->Instrs) {
@@ -1753,10 +1741,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
}
}
Candidates.clear();
- // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
- for (MachineInstr *MI : MergeBaseCandidates)
- MergeBaseUpdateLSDouble(*MI);
- MergeBaseCandidates.clear();
return Changed;
}
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 57dc6cb88bed..090a003424a4 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -566,7 +566,7 @@ Robert Muth started working on an alternate jump table implementation that
does not put the tables in-line in the text. This is more like the llvm
default jump table implementation. This might be useful sometime. Several
revisions of patches are on the mailing list, beginning at:
-http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html
+http://lists.llvm.org/pipermail/llvm-dev/2009-June/022763.html
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 028119c264b3..216e776932be 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -57,7 +57,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Some things to try that should be better:
// * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
// * 'movs $dst, $src' if cpsr isn't live
- // See: http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-August/075998.html
+ // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
// 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH)))
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 29283c81877e..21a8996b1159 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
// Check for an unused caller-saved register.
for ( ; *CallerSavedRegs; ++CallerSavedRegs) {
MCPhysReg FreeReg = *CallerSavedRegs;
- if (!MRI.reg_nodbg_empty(FreeReg))
+ if (MRI.isPhysRegUsed(FreeReg))
continue;
// Check aliased register usage.
bool IsCurrentRegUsed = false;
for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI)
- if (!MRI.reg_nodbg_empty(*AI)) {
+ if (MRI.isPhysRegUsed(*AI)) {
IsCurrentRegUsed = true;
break;
}
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index c37cf95cadc3..f917ecad4a53 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -500,14 +500,6 @@ def : MipsPat<(trunc (assertzext GPR64:$src)),
def : MipsPat<(i32 (trunc GPR64:$src)),
(SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
-// Bypass trunc nodes for bitwise ops.
-def : MipsPat<(i32 (trunc (and GPR64:$lhs, GPR64:$rhs))),
- (EXTRACT_SUBREG (AND64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
-def : MipsPat<(i32 (trunc (or GPR64:$lhs, GPR64:$rhs))),
- (EXTRACT_SUBREG (OR64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
-def : MipsPat<(i32 (trunc (xor GPR64:$lhs, GPR64:$rhs))),
- (EXTRACT_SUBREG (XOR64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
-
// variable shift instructions patterns
def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))),
(DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index e2f6fcc17726..5152a072b3a2 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -267,6 +267,9 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
}
unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+ if (!TargetSupported)
+ return 0;
+
assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 &&
"Alloca should always return a pointer.");
@@ -290,12 +293,7 @@ unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) {
return 0;
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
const ConstantInt *CI = cast<ConstantInt>(C);
- int64_t Imm;
- if ((VT != MVT::i1) && CI->isNegative())
- Imm = CI->getSExtValue();
- else
- Imm = CI->getZExtValue();
- return materialize32BitInt(Imm, RC);
+ return materialize32BitInt(CI->getZExtValue(), RC);
}
unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
@@ -382,6 +380,9 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
// Materialize a constant into a register, and return the register
// number (or zero if we failed to handle it).
unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
+ if (!TargetSupported)
+ return 0;
+
EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
@@ -981,6 +982,13 @@ bool MipsFastISel::selectSelect(const Instruction *I) {
if (!Src1Reg || !Src2Reg || !CondReg)
return false;
+ unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
+ if (!ZExtCondReg)
+ return false;
+
+ if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true))
+ return false;
+
unsigned ResultReg = createResultReg(RC);
unsigned TempReg = createResultReg(RC);
@@ -989,7 +997,7 @@ bool MipsFastISel::selectSelect(const Instruction *I) {
emitInst(TargetOpcode::COPY, TempReg).addReg(Src2Reg);
emitInst(CondMovOpc, ResultReg)
- .addReg(Src1Reg).addReg(CondReg).addReg(TempReg);
+ .addReg(Src1Reg).addReg(ZExtCondReg).addReg(TempReg);
updateValueMap(I, ResultReg);
return true;
}
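The extra zero-extension is presumably needed because the conditional-move instruction tests the full 32-bit condition register against zero; an i1 condition whose upper bits were never cleared could otherwise select the wrong operand.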
@@ -1232,12 +1240,19 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
}
bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
+ if (!TargetSupported)
+ return false;
+
CallingConv::ID CC = CLI.CallConv;
bool IsTailCall = CLI.IsTailCall;
bool IsVarArg = CLI.IsVarArg;
const Value *Callee = CLI.Callee;
MCSymbol *Symbol = CLI.Symbol;
+ // Do not handle FastCC.
+ if (CC == CallingConv::Fast)
+ return false;
+
// Allow SelectionDAG isel to handle tail calls.
if (IsTailCall)
return false;
@@ -1312,6 +1327,9 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
}
bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+ if (!TargetSupported)
+ return false;
+
switch (II->getIntrinsicID()) {
default:
return false;
@@ -1415,6 +1433,11 @@ bool MipsFastISel::selectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
CallingConv::ID CC = F.getCallingConv();
+
+ // Do not handle FastCC.
+ if (CC == CallingConv::Fast)
+ return false;
+
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index fbebb9abb4cc..fab2fdfef8cf 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
@@ -53,11 +54,6 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
cl::desc("MIPS: Don't trap on integer division by zero."),
cl::init(false));
-cl::opt<bool>
-EnableMipsFastISel("mips-fast-isel", cl::Hidden,
- cl::desc("Allow mips-fast-isel to be used"),
- cl::init(false));
-
static const MCPhysReg Mips64DPRegs[8] = {
Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
@@ -461,7 +457,7 @@ const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM
FastISel *
MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
- if (!EnableMipsFastISel)
+ if (!funcInfo.MF->getTarget().Options.EnableFastISel)
return TargetLowering::createFastISel(funcInfo, libInfo);
return Mips::createFastISel(funcInfo, libInfo);
}
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 4799ea27c4b4..93a503c3758d 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -12,6 +12,7 @@
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -22,10 +23,12 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
class PPCDisassembler : public MCDisassembler {
+ bool IsLittleEndian;
+
public:
- PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
- : MCDisassembler(STI, Ctx) {}
- ~PPCDisassembler() override {}
+ PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ bool IsLittleEndian)
+ : MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {}
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -37,7 +40,13 @@ public:
static MCDisassembler *createPPCDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new PPCDisassembler(STI, Ctx);
+ return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false);
+}
+
+static MCDisassembler *createPPCLEDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true);
}
extern "C" void LLVMInitializePowerPCDisassembler() {
@@ -47,7 +56,7 @@ extern "C" void LLVMInitializePowerPCDisassembler() {
TargetRegistry::RegisterMCDisassembler(ThePPC64Target,
createPPCDisassembler);
TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget,
- createPPCDisassembler);
+ createPPCLEDisassembler);
}
// FIXME: These can be generated by TableGen from the existing register
@@ -383,9 +392,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::Fail;
}
- // The instruction is big-endian encoded.
- uint32_t Inst =
- (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0);
+ // Read the instruction in the proper endianness.
+ uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data())
+ : support::endian::read32be(Bytes.data());
if (STI.getFeatureBits()[PPC::FeatureQPX]) {
DecodeStatus result =
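A self-contained sketch of the helper pair used above (the byte values are only an example; 0x7C0802A6 is the big-endian encoding of "mflr 0"):

    #include "llvm/Support/Endian.h"
    #include <cstdint>

    static void endianExample() {
      // The same four bytes decode to different words depending on which
      // reader the disassembler's endianness flag selects.
      const uint8_t Bytes[4] = {0x7C, 0x08, 0x02, 0xA6};
      uint32_t AsBE = llvm::support::endian::read32be(Bytes); // 0x7C0802A6
      uint32_t AsLE = llvm::support::endian::read32le(Bytes); // 0xA602087C
      (void)AsBE; (void)AsLE;
    }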
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 199a0debf88b..444446692c58 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -363,71 +363,85 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
SM.recordPatchPoint(MI);
PatchPointOpers Opers(&MI);
- int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
unsigned EncodedBytes = 0;
- if (CallTarget) {
- assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
- "High 16 bits of call target should be zero.");
- unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
- EncodedBytes = 0;
- // Materialize the jump address:
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
- .addReg(ScratchReg)
- .addImm((CallTarget >> 32) & 0xFFFF));
- ++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
- .addReg(ScratchReg)
- .addReg(ScratchReg)
- .addImm(32).addImm(16));
- ++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
- .addReg(ScratchReg)
- .addReg(ScratchReg)
- .addImm((CallTarget >> 16) & 0xFFFF));
- ++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
- .addReg(ScratchReg)
- .addReg(ScratchReg)
- .addImm(CallTarget & 0xFFFF));
-
- // Save the current TOC pointer before the remote call.
- int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD)
- .addReg(PPC::X2)
- .addImm(TOCSaveOffset)
- .addReg(PPC::X1));
- ++EncodedBytes;
-
-
- // If we're on ELFv1, then we need to load the actual function pointer from
- // the function descriptor.
- if (!Subtarget->isELFv2ABI()) {
- // Load the new TOC pointer and the function address, but not r11
- // (needing this is rare, and loading it here would prevent passing it
- // via a 'nest' parameter.
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ const MachineOperand &CalleeMO =
+ Opers.getMetaOper(PatchPointOpers::TargetPos);
+
+ if (CalleeMO.isImm()) {
+ int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+ if (CallTarget) {
+ assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+ "High 16 bits of call target should be zero.");
+ unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ EncodedBytes = 0;
+ // Materialize the jump address:
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 32) & 0xFFFF));
+ ++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(32).addImm(16));
+ ++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 16) & 0xFFFF));
+ ++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(CallTarget & 0xFFFF));
+
+ // Save the current TOC pointer before the remote call.
+ int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD)
.addReg(PPC::X2)
- .addImm(8)
+ .addImm(TOCSaveOffset)
+ .addReg(PPC::X1));
+ ++EncodedBytes;
+
+
+ // If we're on ELFv1, then we need to load the actual function pointer
+ // from the function descriptor.
+ if (!Subtarget->isELFv2ABI()) {
+ // Load the new TOC pointer and the function address, but not r11
+ // (needing this is rare, and loading it here would prevent passing it
+ // via a 'nest' parameter.
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ .addReg(PPC::X2)
+ .addImm(8)
+ .addReg(ScratchReg));
+ ++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ .addReg(ScratchReg)
+ .addImm(0)
+ .addReg(ScratchReg));
+ ++EncodedBytes;
+ }
+
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8)
.addReg(ScratchReg));
++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
+ ++EncodedBytes;
+
+ // Restore the TOC pointer after the call.
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
- .addReg(ScratchReg)
- .addImm(0)
- .addReg(ScratchReg));
+ .addReg(PPC::X2)
+ .addImm(TOCSaveOffset)
+ .addReg(PPC::X1));
++EncodedBytes;
}
+ } else if (CalleeMO.isGlobal()) {
+ const GlobalValue *GValue = CalleeMO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext);
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg));
- ++EncodedBytes;
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
- ++EncodedBytes;
-
- // Restore the TOC pointer after the call.
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
- .addReg(PPC::X2)
- .addImm(TOCSaveOffset)
- .addReg(PPC::X1));
- ++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP)
+ .addExpr(SymVar));
+ EncodedBytes += 2;
}
// Each instruction is 4 bytes.
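For the accounting: the symbolic-callee path adds 2 to EncodedBytes because BL8_NOP occupies two 4-byte slots, the bl itself plus the trailing nop reserved for the linker's TOC restore, which matches the 4-bytes-per-instruction note above.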
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 87229d80d9c1..08ae7174244a 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -306,10 +306,9 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
DebugLoc dl = MI->getDebugLoc();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UsedRegMask = 0;
for (unsigned i = 0; i != 32; ++i)
- if (MRI.isPhysRegModified(VRRegNo[i]))
+ if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
UsedRegMask |= 1 << (31-i);
// Live in and live out values already must be in the mask, so don't bother
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 01a3acb742e6..b6025bf66ef7 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -2305,14 +2305,15 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
if (Swap)
std::swap(LHS, RHS);
+ EVT ResVT = VecVT.changeVectorElementTypeToInteger();
if (Negate) {
- SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+ SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
PPC::VNOR,
- VecVT, VCmp, VCmp);
+ ResVT, VCmp, VCmp);
}
- return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+ return CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
}
if (PPCSubTarget->useCRBits())
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 0ed9b051ffed..1e28913d1fca 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -580,6 +580,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
+ addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
@@ -1416,7 +1417,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
} else
return -1;
- if (ShuffleKind == 2 && isLE)
+ if (isLE)
ShiftAmt = 16 - ShiftAmt;
return ShiftAmt;
@@ -1429,6 +1430,11 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
assert(N->getValueType(0) == MVT::v16i8 &&
(EltSize == 1 || EltSize == 2 || EltSize == 4));
+ // The consecutive indices need to specify an element, not part of two
+ // different elements. So abandon ship early if this isn't the case.
+ if (N->getMaskElt(0) % EltSize != 0)
+ return false;
+
// This is a splat operation if each element of the permute is the same, and
// if the value doesn't reference the second vector.
unsigned ElementBase = N->getMaskElt(0);
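Note: the early-exit check added above rejects masks whose first index points into the middle of an element. A standalone sketch of the condition (hypothetical helper, not part of the patch):

  // Hypothetical helper: a first mask index of 1 with EltSize == 2 would pull bytes
  // from two different halfwords, so it cannot describe a halfword splat and is
  // rejected before the per-element scan.
  static bool splatStartsOnElementBoundary(int FirstMaskElt, int EltSize) {
    return FirstMaskElt % EltSize == 0;
  }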
@@ -7011,17 +7017,20 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
- return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
+ unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
+ return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
- return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
+ unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
+ return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
- return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
+ unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
+ return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
}
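Note: all three vsldoi cases above apply the same endianness rule; the shift amount is defined in big-endian byte order, so on little-endian targets it is mirrored within the 16-byte vector. A standalone sketch (hypothetical helper, not part of the patch):

  // Hypothetical helper: a big-endian amount of Amt bytes corresponds to
  // 16 - Amt bytes once the 16-byte vector is viewed little-endian, which is
  // why the amounts above become 15, 14 and 13.
  static unsigned vsldoiShiftAmount(unsigned BigEndianAmt, bool IsLittleEndian) {
    return IsLittleEndian ? 16 - BigEndianAmt : BigEndianAmt;
  }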
@@ -9957,6 +9966,9 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
if (Src.getValueType() == MVT::f32) {
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
DCI.AddToWorklist(Src.getNode());
+ } else if (Src.getValueType() != MVT::f64) {
+ // Make sure that we don't pick up a ppc_fp128 source value.
+ return SDValue();
}
unsigned FCTOp =
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 282d9234c1a5..7e9888cc13e8 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -106,7 +106,7 @@ for 1,2,4,8 bytes.
//===---------------------------------------------------------------------===//
It would be nice to revert this patch:
-http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
+http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
And teach the dag combiner enough to simplify the code expanded before
legalize. It seems plausible that this knowledge would let it simplify other
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 8fa10dcae114..c0279daa63d9 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
{
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
- if (!MRI->reg_nodbg_empty(reg))
+ if (MRI->isPhysRegUsed(reg))
return false;
for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
- if (!MRI->reg_nodbg_empty(reg))
+ if (MRI->isPhysRegUsed(reg))
return false;
return true;
@@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
- return !(MFI->hasCalls() // has calls
- || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
- || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
- || hasFP(MF)); // need %FP
+ return !(MFI->hasCalls() // has calls
+ || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
+ || MRI.isPhysRegUsed(SP::O6) // %SP is used
+ || hasFP(MF)); // need %FP
}
void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
@@ -218,13 +218,16 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
- if (MRI.reg_nodbg_empty(reg))
+ if (!MRI.isPhysRegUsed(reg))
continue;
unsigned mapped_reg = (reg - SP::I0 + SP::O0);
- assert(MRI.reg_nodbg_empty(mapped_reg));
+ assert(!MRI.isPhysRegUsed(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
+
+ // Mark the reg unused.
+ MRI.setPhysRegUnused(reg);
}
// Rewrite MBB's Live-ins.
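Note: the remap loop above relies on the %i and %o register enums being contiguous and parallel, so the rewrite is a plain index offset. A standalone sketch (hypothetical helper, not part of the patch):

  // Hypothetical helper mirroring "reg - SP::I0 + SP::O0": %iN is rewritten to %oN
  // once the register window save/restore is dropped for a leaf procedure.
  static unsigned remapLeafProcReg(unsigned Reg, unsigned FirstIn, unsigned FirstOut) {
    return Reg - FirstIn + FirstOut;
  }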
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index be8f00b57adb..bdd1b1598adb 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -53,10 +53,6 @@ def RetCC_SystemZ : CallingConv<[
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
-
- // ABI-compliant code returns long double by reference, but that conversion
- // is left to higher-level code. Perhaps we could add an f128 definition
- // here for code that doesn't care about the ABI?
]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 056ee02dcc21..9a753c897519 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1175,6 +1175,20 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
return Chain;
}
+bool SystemZTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ // Detect unsupported vector return types.
+ if (Subtarget.hasVector())
+ VerifyVectorTypes(Outs);
+
+ SmallVector<CCValAssign, 16> RetLocs;
+ CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
+ return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
+}
+
SDValue
SystemZTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 949b67f114ea..07ff25144581 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -423,6 +423,10 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 418f0431e1d8..bca059d8c383 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -681,6 +681,9 @@ private:
std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+ void AddDefaultSrcDestOperands(
+ OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
std::unique_ptr<X86Operand> ParseOperand();
std::unique_ptr<X86Operand> ParseATTOperand();
std::unique_ptr<X86Operand> ParseIntelOperand();
@@ -1014,6 +1017,19 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
Loc, Loc, 0);
}
+void X86AsmParser::AddDefaultSrcDestOperands(
+ OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+ std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(std::move(Dst));
+ Operands.push_back(std::move(Src));
+ } else {
+ Operands.push_back(std::move(Src));
+ Operands.push_back(std::move(Dst));
+ }
+}
+
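Note: the helper defined above only encodes the syntax-dependent operand order, with Intel syntax listing the destination first and AT&T syntax the source first. A standalone analogue with hypothetical names (not part of the patch):

  #include <string>
  #include <utility>
  #include <vector>

  // Hypothetical stand-in for the ordering rule above.
  static std::vector<std::string> defaultSrcDest(bool IntelSyntax,
                                                 std::string Src, std::string Dst) {
    std::vector<std::string> Operands;
    if (IntelSyntax) {
      Operands.push_back(std::move(Dst));
      Operands.push_back(std::move(Src));
    } else {
      Operands.push_back(std::move(Src));
      Operands.push_back(std::move(Dst));
    }
    return Operands;
  }
  // e.g. for "insb" the source is the DX register and the destination is the ES:DI
  // memory operand; Intel mode pushes the destination first, AT&T mode the source.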
std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
if (isParsingIntelSyntax())
return ParseIntelOperand();
@@ -2228,26 +2244,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (Name.startswith("ins") && Operands.size() == 1 &&
(Name == "insb" || Name == "insw" || Name == "insl" ||
Name == "insd" )) {
- if (isParsingIntelSyntax()) {
- Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
- Operands.push_back(DefaultMemDIOperand(NameLoc));
- } else {
- Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
- Operands.push_back(DefaultMemDIOperand(NameLoc));
- }
+ AddDefaultSrcDestOperands(Operands,
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
+ DefaultMemDIOperand(NameLoc));
}
// Append default arguments to "outs[bwld]"
if (Name.startswith("outs") && Operands.size() == 1 &&
(Name == "outsb" || Name == "outsw" || Name == "outsl" ||
Name == "outsd" )) {
- if (isParsingIntelSyntax()) {
- Operands.push_back(DefaultMemSIOperand(NameLoc));
- Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
- } else {
- Operands.push_back(DefaultMemSIOperand(NameLoc));
- Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
- }
+ AddDefaultSrcDestOperands(Operands,
+ DefaultMemSIOperand(NameLoc),
+ X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
}
// Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
@@ -2279,13 +2287,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
(Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
if (Operands.size() == 1) {
- if (isParsingIntelSyntax()) {
- Operands.push_back(DefaultMemSIOperand(NameLoc));
- Operands.push_back(DefaultMemDIOperand(NameLoc));
- } else {
- Operands.push_back(DefaultMemDIOperand(NameLoc));
- Operands.push_back(DefaultMemSIOperand(NameLoc));
- }
+ AddDefaultSrcDestOperands(Operands,
+ DefaultMemDIOperand(NameLoc),
+ DefaultMemSIOperand(NameLoc));
} else if (Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
X86Operand &Op2 = (X86Operand &)*Operands[2];
@@ -2305,13 +2309,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (Operands.size() == 1) {
if (Name == "movsd")
Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
- if (isParsingIntelSyntax()) {
- Operands.push_back(DefaultMemDIOperand(NameLoc));
- Operands.push_back(DefaultMemSIOperand(NameLoc));
- } else {
- Operands.push_back(DefaultMemSIOperand(NameLoc));
- Operands.push_back(DefaultMemDIOperand(NameLoc));
- }
+ AddDefaultSrcDestOperands(Operands,
+ DefaultMemSIOperand(NameLoc),
+ DefaultMemDIOperand(NameLoc));
} else if (Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
X86Operand &Op2 = (X86Operand &)*Operands[2];
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 36a8cdbab55b..40b9c8a863a3 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -301,9 +301,8 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
bool FPIsUsed = false;
static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
- const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0; i <= 6; ++i)
- if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
+ if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
FPIsUsed = true;
break;
}
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 2a35c4cf31f3..3a21b57f0157 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1682,6 +1682,8 @@ void X86FrameLowering::adjustForSegmentedStacks(
.addImm(StackSize);
BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
.addImm(X86FI->getArgumentStackSize());
+ MF.getRegInfo().setPhysRegUsed(Reg10);
+ MF.getRegInfo().setPhysRegUsed(Reg11);
} else {
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(X86FI->getArgumentStackSize());
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6e22ab30057c..71ccb1ab1e55 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12640,24 +12640,29 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
if (User->getOpcode() == ISD::FNEG)
return Op;
- SDValue Op0 = Op.getOperand(0);
- bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- // Assume scalar op for initialization; update for vector if needed.
- // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
- // generate a 16-byte vector constant and logic op even for the scalar case.
- // Using a 16-byte mask allows folding the load of the mask with
- // the logic op, so it can save (~4 bytes) on code size.
- MVT EltVT = VT;
- unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
+
+ MVT LogicVT;
+ MVT EltVT;
+ unsigned NumElts;
+
if (VT.isVector()) {
+ LogicVT = VT;
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
+ } else {
+ // There are no scalar bitwise logical SSE/AVX instructions, so we
+ // generate a 16-byte vector constant and logic op even for the scalar case.
+ // Using a 16-byte mask allows folding the load of the mask with
+ // the logic op, so it can save (~4 bytes) on code size.
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ EltVT = VT;
+ NumElts = (VT == MVT::f64) ? 2 : 4;
}
unsigned EltBits = EltVT.getSizeInBits();
@@ -12670,26 +12675,25 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, Alignment);
- if (VT.isVector()) {
- // For a vector, cast operands to a vector type, perform the logic op,
- // and cast the result back to the original value type.
- MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
- SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
- SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
- : DAG.getBitcast(VecVT, Op0);
- unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
- return DAG.getBitcast(VT,
- DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
- }
-
- // If not vector, then scalar.
- unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+ SDValue Op0 = Op.getOperand(0);
+ bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+ unsigned LogicOp =
+ IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
- return DAG.getNode(BitOp, dl, VT, Operand, Mask);
+
+ if (VT.isVector())
+ return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+ // For the scalar case extend to a 128-bit vector, perform the logic op,
+ // and extract the scalar result back out.
+ Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+ SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+ DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -12729,10 +12733,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
Constant *C = ConstantVector::get(CV);
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
- SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+
+ // Perform all logic operations as 16-byte vectors because there are no
+ // scalar FP logic instructions in SSE. This allows load folding of the
+ // constants into the logic instructions.
+ MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
- SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+ Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
// If it's a constant, we can clear it here.
@@ -12740,7 +12750,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
APFloat APF = Op0CN->getValueAPF();
// If the magnitude is a positive zero, the sign bit alone is enough.
if (APF.isPosZero())
- return SignBit;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+ DAG.getIntPtrConstant(0, dl));
APF.clearSign();
CV[0] = ConstantFP::get(*Context, APF);
} else {
@@ -12750,15 +12761,18 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, PtrVT, 16);
- SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
- if (!isa<ConstantFPSDNode>(Op0))
- Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
-
+ if (!isa<ConstantFPSDNode>(Op0)) {
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+ Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+ }
// OR the magnitude value with the sign bit.
- return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+ Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+ DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
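Note: LowerFABSorFNEG and LowerFCOPYSIGN above now keep scalar operands inside a 128-bit vector purely so the mask constant's load can be folded into the logic instruction; the underlying operation is still IEEE-754 sign-bit masking. A standalone sketch for f64 (assumed constants, not part of the patch):

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  int main() {
    const uint64_t SignBit = 0x8000000000000000ULL; // f64 sign bit: XORed for FNEG, ORed for FCOPYSIGN
    const uint64_t AbsMask = ~SignBit;               // ANDed for FABS

    double X = -3.5;
    uint64_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits));

    uint64_t AbsBits = Bits & AbsMask; // what the folded andpd computes
    uint64_t NegBits = Bits ^ SignBit; // what the folded xorpd computes

    double Abs, Neg;
    std::memcpy(&Abs, &AbsBits, sizeof(Abs));
    std::memcpy(&Neg, &NegBits, sizeof(Neg));
    std::printf("fabs(%g) = %g, fneg(%g) = %g\n", X, Abs, X, Neg); // both print 3.5
  }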
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 786150760b93..cf68ef053361 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -956,18 +956,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- // FIXME: We should not be folding Fs* scalar loads into vector
- // instructions because the vector instructions require vector-sized
- // loads. Lowering should create vector-sized instructions (the Fv*
- // variants below) to allow load folding.
- { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
- { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
- { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
- { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
- { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
- { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
- { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
- { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+ // Do not fold Fs* scalar logical op loads because there are no scalar
+ // load variants for these instructions. When folded, the load is required
+ // to be 128 bits, so the load size would not match.
{ X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
{ X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a5ff9edf05a3..99386b0658ad 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2919,6 +2919,14 @@ multiclass sse12_fp_packed_vector_logical_alias<
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
PD, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
+ PS, VEX_4V, VEX_L;
+
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
+ PD, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {