diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-06-03 15:20:36 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-06-03 15:20:36 +0000 |
commit | d288ef4c1788d3a951a7558c68312c2d320612b1 (patch) | |
tree | ece909a5200f95f85f0813599a9500620f4d9217 /lib/Target | |
parent | f382538d471e38a9b98f016c4caebd24c8d60b62 (diff) | |
download | src-d288ef4c1788d3a951a7558c68312c2d320612b1.tar.gz src-d288ef4c1788d3a951a7558c68312c2d320612b1.zip |
Vendor import of llvm trunk r304659:vendor/llvm/llvm-trunk-r304659
Notes
Notes:
svn path=/vendor/llvm/dist/; revision=319523
svn path=/vendor/llvm/llvm-trunk-r304659/; revision=319524; tag=vendor/llvm/llvm-trunk-r304659
Diffstat (limited to 'lib/Target')
27 files changed, 2146 insertions, 114 deletions
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h index 4f656f94ea12..b99c1d1d6b3e 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.h +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -1,4 +1,4 @@ -//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===// +//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -15,6 +15,8 @@ namespace llvm { +class TargetRegisterInfo; + /// Add the accumulator chaining constraint to a PBQP graph class A57ChainingConstraint : public PBQPRAConstraint { public: @@ -33,6 +35,7 @@ private: // Add constraints between existing chains void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); }; -} + +} // end namespace llvm #endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index d098cf7a5a37..7402bcf1346c 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -56,12 +56,14 @@ def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_0cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 0; } def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } +def FalkorWr_1VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 0; } def 
FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } @@ -76,6 +78,7 @@ def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } +def FalkorWr_1GTOV_0cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 0; } def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } @@ -83,6 +86,10 @@ def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } //===----------------------------------------------------------------------===// // Define 2 micro-op types +def FalkorWr_2VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 0; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 1; let NumMicroOps = 2; @@ -476,17 +483,19 @@ def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr // SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast // ----------------------------------------------------------------------------- def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; -def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || +def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || + MI->getOperand(1).getReg() == AArch64::XZR}]>; def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>; def FalkorWr_FMOV : SchedWriteVariant<[ - SchedVar<FalkorFMOVZrReg, [FalkorWr_1none_0cyc]>, + SchedVar<FalkorOp1ZrReg, 
[FalkorWr_1none_0cyc]>, SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>; def FalkorWr_MOVZ : SchedWriteVariant<[ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>; + SchedVar<NoSchedPred, [FalkorWr_1XYZB_0cyc]>]>; // imm fwd + def FalkorWr_ADDSUBsx : SchedWriteVariant<[ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>, @@ -500,6 +509,10 @@ def FalkorWr_LDRSro : SchedWriteVariant<[ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>, SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>; +def FalkorWr_ORRi : SchedWriteVariant<[ + SchedVar<FalkorOp1ZrReg, [FalkorWr_1XYZ_0cyc]>, // imm fwd + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1cyc]>]>; + def FalkorWr_PRFMro : SchedWriteVariant<[ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>, SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>; @@ -810,7 +823,8 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>; +def : InstRW<[FalkorWr_ORRi], (instregex "^ORR(W|X)ri$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>; def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>; @@ -825,7 +839,7 @@ def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; +def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex 
"(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; @@ -849,7 +863,7 @@ def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; +def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; @@ -1036,13 +1050,13 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFM // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>; +def : InstRW<[FalkorWr_1GTOV_0cyc], (instregex "^FMOV(S|D)i$")>; // imm fwd def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; +def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; // imm fwd // FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr -def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; +def : InstRW<[FalkorWr_2VXVY_0cyc], (instrs FMOVD0, FMOVS0)>; // imm fwd def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; def : 
InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; @@ -1107,11 +1121,12 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], // Move and Shift Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|X).*")>; -def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>; -def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV)(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_0cyc], (instregex "^MOVK(W|X)i$")>; // imm fwd +def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^ADRP?$")>; // imm fwd +def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^MOVN(W|X)i$")>; // imm fwd def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs MOVi32imm, MOVi64imm)>; +def : InstRW<[FalkorWr_1XYZ_0cyc], (instrs MOVi32imm, MOVi64imm)>; // imm fwd (approximation) def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>], (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>; def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>], diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 78ff3bbe3d1a..55d18c3f3646 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -55,6 +55,8 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass(); void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); extern char &AMDGPUMachineCFGStructurizerID; +void initializeAMDGPUAlwaysInlinePass(PassRegistry&); + ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index e7ebb37a9d62..b50e8d1d659e 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ 
b/lib/Target/AMDGPU/AMDGPU.td @@ -365,6 +365,13 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "Force to generate flat instruction for global" >; +def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < + "auto-waitcnt-before-barrier", + "AutoWaitcntBeforeBarrier", + "true", + "Hardware automatically inserts waitcnt before barrier" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 1d03714874e2..8084d368c80f 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -22,18 +22,22 @@ using namespace llvm; namespace { class AMDGPUAlwaysInline : public ModulePass { - static char ID; - bool GlobalOpt; public: - AMDGPUAlwaysInline(bool GlobalOpt) : ModulePass(ID), GlobalOpt(GlobalOpt) { } + static char ID; + + AMDGPUAlwaysInline(bool GlobalOpt = false) : + ModulePass(ID), GlobalOpt(GlobalOpt) { } bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } }; } // End anonymous namespace +INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", + "AMDGPU Inline All Functions", false, false) + char AMDGPUAlwaysInline::ID = 0; bool AMDGPUAlwaysInline::runOnModule(Module &M) { diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 57905be18813..267f4807a788 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -28,11 +28,16 @@ using namespace llvm; AMDGPULegalizerInfo::AMDGPULegalizerInfo() { using namespace TargetOpcode; + const LLT S1= LLT::scalar(1); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); + // FIXME: i1 operands to intrinsics should always be legal, but 
other i1 + // values may not be legal. We need to figure out how to distinguish + // between these two scenarios. + setAction({G_CONSTANT, S1}, Legal); setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 6e301b4ad527..8d157e2f98f2 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -91,6 +91,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FPExceptions(false), DX10Clamp(false), FlatForGlobal(false), + AutoWaitcntBeforeBarrier(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 0582ce95693a..ed9cbb994fad 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -110,6 +110,7 @@ protected: bool FPExceptions; bool DX10Clamp; bool FlatForGlobal; + bool AutoWaitcntBeforeBarrier; bool UnalignedScratchAccess; bool UnalignedBufferAccess; bool HasApertureRegs; @@ -195,7 +196,8 @@ public: } bool isOpenCLEnv() const { - return TargetTriple.getEnvironment() == Triple::OpenCL; + return TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getEnvironmentName() == "amdgizcl"; } Generation getGeneration() const { @@ -363,6 +365,10 @@ public: return FlatForGlobal; } + bool hasAutoWaitcntBeforeBarrier() const { + return AutoWaitcntBeforeBarrier; + } + bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; } @@ -727,12 +733,6 @@ public: /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; - /// \returns True if waitcnt instruction is needed before barrier instruction, - /// false otherwise. 
- bool needWaitcntBeforeBarrier() const { - return true; - } - /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 596f02ae4a64..404598ff4738 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -116,7 +116,7 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, static cl::opt<bool> EnableSIInsertWaitcntsPass( "enable-si-insert-waitcnts", cl::desc("Use new waitcnt insertion pass"), - cl::init(false)); + cl::init(true)); // Option to run late CFG structurizer static cl::opt<bool> LateCFGStructurize( @@ -139,6 +139,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index f13629a3185f..dfac068d1f69 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -35,9 +35,12 @@ struct FoldCandidate { }; unsigned char UseOpNo; MachineOperand::MachineOperandType Kind; + bool Commuted; - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) { + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, + bool Commuted_ = false) : + UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()), + Commuted(Commuted_) { if (FoldOp->isImm()) { ImmToFold = FoldOp->getImm(); } else if (FoldOp->isFI()) { @@ -59,6 +62,10 @@ struct 
FoldCandidate { bool isReg() const { return Kind == MachineOperand::MO_Register; } + + bool isCommuted() const { + return Commuted; + } }; class SIFoldOperands : public MachineFunctionPass { @@ -237,8 +244,13 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) return false; - if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { + TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true)); + return true; } FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); @@ -699,6 +711,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); tryFoldInst(TII, Fold.UseMI); + } else if (Fold.isCommuted()) { + // Restoring instruction's original operand order if fold has failed. 
+ TII->commuteInstruction(*Fold.UseMI, false); } } } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 76c2644867aa..b48b23911105 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3571,7 +3571,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && - isMemOpHasNoClobberedMemOperand(Load)) + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index e22166d03e9a..c10badba88f3 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1009,7 +1009,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // occurs before the instruction. Doing it here prevents any additional // S_WAITCNTs from being emitted if the instruction was marked as // requiring a WAITCNT beforehand. 
- if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) { + if (MI.getOpcode() == AMDGPU::S_BARRIER && + !ST->hasAutoWaitcntBeforeBarrier()) { EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); EmitSwaitcnt |= ScoreBrackets->updateByWait( diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 9f32ecfa52ff..bc86515d8b1f 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -630,7 +630,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks if ((I->getOpcode() == AMDGPU::S_BARRIER && - ST->needWaitcntBeforeBarrier()) || + !ST->hasAutoWaitcntBeforeBarrier()) || I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT) Required = LastIssued; diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 5b840a14dbc3..73dd8b7daa4e 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -229,6 +229,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && + !Ld->isVolatile() && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 001fc960b228..77fc9551cff9 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -245,9 +245,10 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; let SubtargetPredicate = 
Has16BitInsts in { +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; + let isCommutable = 1 in { -def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>; def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 46fd1f70ee99..ca68f5d42c32 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -205,6 +205,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", "AvoidCPSRPartialUpdate", "true", "Avoid CPSR partial update for OOO execution">; +/// Disable +1 predication cost for instructions updating CPSR. +/// Enabled for Cortex-A57. +def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", + "CheapPredicableCPSRDef", + "true", + "Disable +1 predication cost for instructions updating CPSR">; + def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", "AvoidMOVsShifterOperand", "true", "Avoid movs instructions with shifter operand">; @@ -788,12 +795,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 5c9d589e2625..f8b65573f9cd 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ 
-558,13 +558,68 @@ bool ARMBaseInstrInfo::DefinesPredicate( return Found; } -static bool isCPSRDefined(const MachineInstr *MI) { - for (const auto &MO : MI->operands()) +bool ARMBaseInstrInfo::isCPSRDefined(const MachineInstr &MI) { + for (const auto &MO : MI.operands()) if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead()) return true; return false; } +bool ARMBaseInstrInfo::isAddrMode3OpImm(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Offset = MI.getOperand(Op + 1); + return Offset.getReg() != 0; +} + +// Load with negative register offset requires additional 1cyc and +I unit +// for Cortex A57 +bool ARMBaseInstrInfo::isAddrMode3OpMinusReg(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Offset = MI.getOperand(Op + 1); + const MachineOperand &Opc = MI.getOperand(Op + 2); + assert(Opc.isImm()); + assert(Offset.isReg()); + int64_t OpcImm = Opc.getImm(); + + bool isSub = ARM_AM::getAM3Op(OpcImm) == ARM_AM::sub; + return (isSub && Offset.getReg() != 0); +} + +bool ARMBaseInstrInfo::isLdstScaledReg(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Opc = MI.getOperand(Op + 2); + unsigned OffImm = Opc.getImm(); + return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift; +} + +// Load, scaled register offset, not plus LSL2 +bool ARMBaseInstrInfo::isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Opc = MI.getOperand(Op + 2); + unsigned OffImm = Opc.getImm(); + + bool isAdd = ARM_AM::getAM2Op(OffImm) == ARM_AM::add; + unsigned Amt = ARM_AM::getAM2Offset(OffImm); + ARM_AM::ShiftOpc ShiftOpc = ARM_AM::getAM2ShiftOpc(OffImm); + if (ShiftOpc == ARM_AM::no_shift) return false; // not scaled + bool SimpleScaled = (isAdd && ShiftOpc == ARM_AM::lsl && Amt == 2); + return !SimpleScaled; +} + +// Minus reg for ldstso addr mode +bool ARMBaseInstrInfo::isLdstSoMinusReg(const MachineInstr &MI, + unsigned Op) const { + unsigned OffImm = 
MI.getOperand(Op + 2).getImm(); + return ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; +} + +// Load, scaled register offset +bool ARMBaseInstrInfo::isAm2ScaledReg(const MachineInstr &MI, + unsigned Op) const { + unsigned OffImm = MI.getOperand(Op + 2).getImm(); + return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift; +} + static bool isEligibleForITBlock(const MachineInstr *MI) { switch (MI->getOpcode()) { default: return true; @@ -590,7 +645,7 @@ static bool isEligibleForITBlock(const MachineInstr *MI) { case ARM::tSUBi3: // SUB (immediate) T1 case ARM::tSUBi8: // SUB (immediate) T2 case ARM::tSUBrr: // SUB (register) T1 - return !isCPSRDefined(MI); + return !ARMBaseInstrInfo::isCPSRDefined(*MI); } } @@ -3349,6 +3404,22 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, return DefCycle; } +bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const { + unsigned BaseReg = MI.getOperand(0).getReg(); + for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) { + const auto &Op = MI.getOperand(i); + if (Op.isReg() && Op.getReg() == BaseReg) + return true; + } + return false; +} +unsigned +ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const { + // ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops + // (outs GPR:$wb), (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops) + return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands(); +} + int ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData, const MCInstrDesc &DefMCID, @@ -4119,7 +4190,8 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const { const MCInstrDesc &MCID = MI.getDesc(); - if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) { + if (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) && + !Subtarget.cheapPredicableCPSRDef())) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. 
return 1; @@ -4148,7 +4220,8 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &MCID = MI.getDesc(); - if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) { + if (PredCost && (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) && + !Subtarget.cheapPredicableCPSRDef()))) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. *PredCost = 1; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index dd7fe871345a..c52e572786d4 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -159,6 +159,24 @@ public: bool isPredicable(const MachineInstr &MI) const override; + // CPSR defined in instruction + static bool isCPSRDefined(const MachineInstr &MI); + bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const; + bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const; + + // Load, scaled register offset + bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const; + // Load, scaled register offset, not plus LSL2 + bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const; + // Minus reg for ldstso addr mode + bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const; + // Scaled register offset in address mode 2 + bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const; + // Load multiple, base reg in list + bool isLDMBaseRegInList(const MachineInstr &MI) const; + // get LDM variable defs size + unsigned getLDMVariableDefsSize(const MachineInstr &MI) const; + /// GetInstSize - Returns the size of the specified MachineInstr. 
/// unsigned getInstSizeInBytes(const MachineInstr &MI) const override; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 31a2f499a9a7..a33d025d114e 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -34,7 +34,7 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { - if (T->isArrayTy()) + if (T->isArrayTy() || T->isStructTy()) return true; EVT VT = TLI.getValueType(DL, T, true); @@ -167,8 +167,11 @@ void ARMCallLowering::splitToValueTypes( if (SplitVTs.size() == 1) { // Even if there is no splitting to do, we still want to replace the // original type (e.g. pointer type -> integer). - SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags, OrigArg.IsFixed); + auto Flags = OrigArg.Flags; + unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty); + Flags.setOrigAlign(OriginalAlignment); + SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), Flags, + OrigArg.IsFixed); return; } @@ -177,6 +180,10 @@ void ARMCallLowering::splitToValueTypes( EVT SplitVT = SplitVTs[i]; Type *SplitTy = SplitVT.getTypeForEVT(Ctx); auto Flags = OrigArg.Flags; + + unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy); + Flags.setOrigAlign(OriginalAlignment); + bool NeedsConsecutiveRegisters = TLI.functionArgumentNeedsConsecutiveRegisters( SplitTy, F->getCallingConv(), F->isVarArg()); @@ -185,6 +192,7 @@ void ARMCallLowering::splitToValueTypes( if (i == e - 1) Flags.setInConsecutiveRegsLast(); } + SplitArgs.push_back( ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), SplitTy, Flags, OrigArg.IsFixed}); diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index ec5b97cba8cd..1c7902520f2d 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -147,6 +147,9 @@ def : PredicateProlog<[{ 
const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo()); (void)TII; + const ARMSubtarget *STI = + static_cast<const ARMSubtarget*>(SchedModel->getSubtargetInfo()); + (void)STI; }]>; def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>; @@ -420,3 +423,4 @@ include "ARMScheduleA8.td" include "ARMScheduleA9.td" include "ARMScheduleSwift.td" include "ARMScheduleR52.td" +include "ARMScheduleA57.td" diff --git a/lib/Target/ARM/ARMScheduleA57.td b/lib/Target/ARM/ARMScheduleA57.td new file mode 100644 index 000000000000..525079d12d51 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleA57.td @@ -0,0 +1,1471 @@ +//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for ARM Cortex-A57 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// *** Common description and scheduling model parameters taken from AArch64 *** +// The Cortex-A57 is a traditional superscalar microprocessor with a +// conservative 3-wide in-order stage for decode and dispatch. Combined with the +// much wider out-of-order issue stage, this produced a need to carefully +// schedule micro-ops so that all three decoded each cycle are successfully +// issued as the reservation station(s) simply don't stay occupied for long. +// Therefore, IssueWidth is set to the narrower of the two at three, while still +// modeling the machine as out-of-order. 
+ +def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>; +def IsCPSRDefinedAndPredicatedPred : + SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>; + +// Cortex A57 rev. r1p0 or later (false = r0px) +def IsR1P0AndLaterPred : SchedPredicate<[{false}]>; + +// If Addrmode3 contains register offset (not immediate) +def IsLdrAm3RegOffPred : + SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>; +// The same predicate with operand offset 2 and 3: +def IsLdrAm3RegOffPredX2 : + SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>; +def IsLdrAm3RegOffPredX3 : + SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>; + +// If Addrmode3 contains "minus register" +def IsLdrAm3NegRegOffPred : + SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>; +// The same predicate with operand offset 2 and 3: +def IsLdrAm3NegRegOffPredX2 : + SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>; +def IsLdrAm3NegRegOffPredX3 : + SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>; + +// Load, scaled register offset, not plus LSL2 +def IsLdstsoScaledNotOptimalPredX0 : + SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>; +def IsLdstsoScaledNotOptimalPred : + SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>; +def IsLdstsoScaledNotOptimalPredX2 : + SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>; + +// Load, scaled register offset +def IsLdstsoScaledPred : + SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>; +def IsLdstsoScaledPredX2 : + SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>; + +def IsLdstsoMinusRegPredX0 : + SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>; +def IsLdstsoMinusRegPred : + SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>; +def IsLdstsoMinusRegPredX2 : + SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>; + +// Load, scaled register offset +def IsLdrAm2ScaledPred : + SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>; + +// LDM, base reg in list +def IsLdmBaseRegInList : + 
SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>; + +class A57WriteLMOpsListType<list<SchedWriteRes> writes> { + list <SchedWriteRes> Writes = writes; + SchedMachineModel SchedModel = ?; +} + +// *** Common description and scheduling model parameters taken from AArch64 *** +// (AArch64SchedA57.td) +def CortexA57Model : SchedMachineModel { + let IssueWidth = 3; // 3-way decode and dispatch + let MicroOpBufferSize = 128; // 128 micro-op re-order buffer + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 16; // Fetch + Decode/Rename/Dispatch + Branch + + // Enable partial & runtime unrolling. + let LoopMicroOpBufferSize = 16; + let CompleteModel = 1; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cortex-A57. +// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where +// micro-ops wait for their operands and then issue out-of-order. + +def A57UnitB : ProcResource<1>; // Type B micro-ops +def A57UnitI : ProcResource<2>; // Type I micro-ops +def A57UnitM : ProcResource<1>; // Type M micro-ops +def A57UnitL : ProcResource<1>; // Type L micro-ops +def A57UnitS : ProcResource<1>; // Type S micro-ops + +def A57UnitX : ProcResource<1>; // Type X micro-ops (F1) +def A57UnitW : ProcResource<1>; // Type W micro-ops (F0) + +let SchedModel = CortexA57Model in { + def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops +} + +let SchedModel = CortexA57Model in { + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Cortex-A57. 
+ +include "ARMScheduleA57WriteRes.td" + +// To have "CompleteModel = 1", support of pseudos and special instructions +def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$", + "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$", + "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$", + "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$", + "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE", + "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG", + "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>; + +def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>; + +// Specific memory instrs +def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC", + "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>; + +// coprocessor moves +def : InstRW<[WriteNoop, WriteNoop], (instregex + "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$", + "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$", + "(t2)?MSR(banked|i|_AR|_M)?$")>; + +// Deprecated instructions +def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>; + +// Pseudos +def : InstRW<[WriteNoop], (instregex "(t2)?ABS$", + "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj", + "tLDRpci_pic", "t2SUBS_PC_LR", + "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp", + "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm", + "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm", + "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm", + "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm", + "WIN__CHKSTK", "WIN__DBZCHK")>; + +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>; + +// --- 3.2 Branch Instructions --- +// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ + +def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$", + "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>; +def : 
InstRW<[A57Write_1cyc_1B_1I], + (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>; +def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>; +// Pseudos +def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>; +def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr", + "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>; +def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>; + +// --- 3.3 Arithmetic and Logical Instructions --- +// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S}, +// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST + +def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>; + +// shift by register, conditional or unconditional +// TODO: according to the doc, conditional uses I0/I1, unconditional uses M +// Why more complex instruction uses more simple pipeline? +// May be an error in doc. +def A57WriteALUsi : SchedWriteVariant<[ + // lsl #2, lsl #1, or lsr #1. + SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1M]> +]>; +def A57WriteALUsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1M]> +]>; +def A57WriteALUSsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1M]> +]>; +def A57ReadALUsr : SchedReadVariant<[ + SchedVar<IsPredicatedPred, [ReadDefault]>, + SchedVar<NoSchedPred, [ReadDefault]> +]>; +def : SchedAlias<WriteALUsi, A57WriteALUsi>; +def : SchedAlias<WriteALUsr, A57WriteALUsr>; +def : SchedAlias<WriteALUSsr, A57WriteALUSsr>; +def : SchedAlias<ReadALUsr, A57ReadALUsr>; + +def A57WriteCMPsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1M]> +]>; +def : SchedAlias<WriteCMP, A57Write_1cyc_1I>; +def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>; +def : SchedAlias<WriteCMPsr, A57WriteCMPsr>; + +// --- 3.4 Move and Shift Instructions --- +// Move, basic 
+// MOV{S}, MOVW, MVN{S} +def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)", + "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL", + "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>; + +// Move, shift by immed, setflags/no setflags +// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN +// setflags = isCPSRDefined +def A57WriteMOVsi : SchedWriteVariant<[ + SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1I]> +]>; +def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi", + "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi", + "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>; + +// shift by register, conditional or unconditional, setflags/no setflags +def A57WriteMOVsr : SchedWriteVariant<[ + SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>, + SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1I]> +]>; +def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs", + "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr", + "(t2|t)RORrr")>; + +// Move, top +// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later +def A57WriteMOVT : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_1cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1M]> +]>; +def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>; + +def A57WriteI2pc : + WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>; +def A57WriteI2ld : + WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>; +def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>; +def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>; + +// +2cyc for branch forms +def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>; + +// --- 3.5 Divide and Multiply Instructions --- +// Divide: SDIV, UDIV +// latency from documentation: 4 ‐ 20, maximum taken +def : SchedAlias<WriteDIV, 
A57Write_20cyc_1M>; +// Multiply: tMul not bound to common WriteRes types +def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>; +def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>; +def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>; +def : ReadAdvance<ReadMUL, 0>; + +// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, +// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R} +// Multiply-accumulate pipelines support late-forwarding of accumulate operands +// from similar μops, allowing a typical sequence of multiply-accumulate μops +// to issue one every 1 cycle (sched advance = 2). +def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; } +def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; } +def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>; + +def : SchedAlias<WriteMAC16, A57WriteMLA>; +def : SchedAlias<WriteMAC32, A57WriteMLA>; +def : SchedAlias<ReadMAC, A57ReadMLA>; + +def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>; +def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>; + +// Multiply long: SMULL, UMULL +def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>; +def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>; + +// --- 3.6 Saturating and Parallel Arithmetic Instructions --- +// Parallel arith +// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8 +// Conditional GE-setting instructions require three extra μops +// and two additional cycles to conditionally update the GE field. 
+def A57WriteParArith : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1I_1M]> +]>; +def : InstRW< [A57WriteParArith], (instregex + "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)", + "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>; + +// Parallel arith with exchange: SASX, SSAX, UASX, USAX +def A57WriteParArithExch : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>, + SchedVar<NoSchedPred, [A57Write_3cyc_1I_1M]> +]>; +def : InstRW<[A57WriteParArithExch], + (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>; + +// Parallel halving arith +// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8 +def : InstRW<[A57Write_2cyc_1M], (instregex + "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)", + "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>; + +// Parallel halving arith with exchange +// SHASX, SHSAX, UHASX, UHSAX +def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX", + "(t2)?UHASX", "(t2)?UHSAX")>; + +// Parallel saturating arith +// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8 +def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)", + "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>; + +// Parallel saturating arith with exchange +// QASX, QSAX, UQASX, UQSAX +def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX", + "(t2)?UQASX", "(t2)?UQSAX")>; + +// Saturate: SSAT, SSAT16, USAT, USAT16 +def : InstRW<[A57Write_2cyc_1M], + (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>; + +// Saturating arith: QADD, QSUB +def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>; + +// Saturating doubling arith: QDADD, QDSUB +def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>; + +// --- 3.7 Miscellaneous Data-Processing Instructions --- +// Bit field extract: SBFX, UBFX +def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>; + +// Bit field insert/clear: BFI, BFC +def : 
InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>; + +// Select bytes, conditional/unconditional +def A57WriteSEL : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1I]> +]>; +def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>; + +// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH +def : InstRW<[A57Write_1cyc_1I], + (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>; + +// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH +def : InstRW<[A57Write_2cyc_1M], + (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>; + +// Sign/zero extend and add, parallel: SXTAB16, UXTAB16 +def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>; + +// Sum of absolute differences: USAD8, USADA8 +def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>; + +// --- 3.8 Load Instructions --- + +// Load, immed offset +// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate +def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12", + "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)", + "PICLDR", "tLDR")>; + +def : InstRW<[A57Write_4cyc_1L], + (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>; + +// For "Load, register offset, minus" we need +1cyc, +1I +def A57WriteLdrAm3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>; +def A57WriteLdrAm3X2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>; +def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>; + +def A57WriteLdrAmLDSTSO : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>, + SchedVar<IsLdstsoMinusRegPred, [A57Write_5cyc_1I_1L]>, + 
SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>; + +def A57WrBackOne : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; +} +def A57WrBackTwo : SchedWriteRes<[]> { + let Latency = 2; + let NumMicroOps = 0; +} +def A57WrBackThree : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} + +// --- LDR pre-indexed --- +// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update) +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM", + "LDRB_PRE_IMM", "t2LDRB_PRE")>; + +// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update) +// (5 cyc load result for not-lsl2 scaled) +def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> +]>; +def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo], + (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>; + +def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack], + (instregex "LDR(H|SH|SB)_PRE")>; +def : InstRW<[A57Write_4cyc_1L, A57WrBackOne], + (instregex "t2LDR(H|SH|SB)?_PRE")>; + +// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm. 
+def A57WriteLdrDAm3Pre : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> +]>; +def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack], + (instregex "LDRD_PRE")>; +def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne], + (instregex "t2LDRD_PRE")>; + +// --- LDR post-indexed --- +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM", + "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>; + +def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack], + (instregex "LDR(H|SH|SB)_POST")>; +def : InstRW<[A57Write_4cyc_1L, A57WrBackOne], + (instregex "t2LDR(H|SH|SB)?_POST")>; + +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG", + "LDRB_POST_REG", "LDR(B?)T_POST$")>; + +def A57WriteLdrTRegPost : SchedWriteVariant<[ + SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> +]>; +def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>, + SchedVar<NoSchedPred, [A57WrBackTwo]> +]>; +// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L" +def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack], + (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>; + +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>; + +def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm. 
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, + A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>; +def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne], + (instregex "t2LDRD_POST")>; + +// --- Preload instructions --- +// Preload, immed offset +def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12", + "t2PLDW?(i8|pci|s)", "(t2)?PLI")>; + +// Preload, register offset, +// 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2 +// otherwise 4cyc "L" +def A57WritePLD : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>, + SchedVar<IsLdstsoMinusRegPredX0, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>; + +// --- Load multiple instructions --- +foreach NumAddr = 1-8 in { + def A57LMAddrPred#NumAddr : + SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>; +} + +def A57LDMOpsListNoregin : A57WriteLMOpsListType< + [A57Write_3cyc_1L, A57Write_3cyc_1L, + A57Write_4cyc_1L, A57Write_4cyc_1L, + A57Write_5cyc_1L, A57Write_5cyc_1L, + A57Write_6cyc_1L, A57Write_6cyc_1L, + A57Write_7cyc_1L, A57Write_7cyc_1L, + A57Write_8cyc_1L, A57Write_8cyc_1L, + A57Write_9cyc_1L, A57Write_9cyc_1L, + A57Write_10cyc_1L, A57Write_10cyc_1L]>; +def A57WriteLDMnoreginlist : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>, + SchedVar<NoSchedPred, A57LDMOpsListNoregin.Writes[0-15]> +]> { let Variadic=1; } + +def A57LDMOpsListRegin : A57WriteLMOpsListType< + 
[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, + A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I, + A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I, + A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, + A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I, + A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>; +def A57WriteLDMreginlist : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>, + SchedVar<NoSchedPred, A57LDMOpsListRegin.Writes[0-15]> +]> { let Variadic=1; } + +def A57LDMOpsList_Upd : A57WriteLMOpsListType< + [A57WrBackOne, + A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I, + A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, + A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I, + A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I, + A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, + A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>; +def A57WriteLDM_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57LDMOpsList_Upd.Writes[0-2]>, + SchedVar<A57LMAddrPred2, A57LDMOpsList_Upd.Writes[0-4]>, + SchedVar<A57LMAddrPred3, A57LDMOpsList_Upd.Writes[0-6]>, + SchedVar<A57LMAddrPred4, A57LDMOpsList_Upd.Writes[0-8]>, + SchedVar<A57LMAddrPred5, A57LDMOpsList_Upd.Writes[0-10]>, + SchedVar<A57LMAddrPred6, A57LDMOpsList_Upd.Writes[0-12]>, + SchedVar<A57LMAddrPred7, A57LDMOpsList_Upd.Writes[0-14]>, + SchedVar<A57LMAddrPred8, A57LDMOpsList_Upd.Writes[0-16]>, + SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]> +]> { let Variadic=1; } + +def 
A57WriteLDM : SchedWriteVariant<[ + SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>, + SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]> +]> { let Variadic=1; } + +def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>; + +// TODO: no writeback latency defined in documentation (implemented as 1 cyc) +def : InstRW<[A57WriteLDM_Upd], + (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>; + +// --- 3.9 Store Instructions --- + +// Store, immed offset +def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR", + "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>; + +// Store, register offset +// For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S", +// otherwise 1cyc S. +def A57WriteStrAmLDSTSO : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>, + SchedVar<IsLdstsoMinusRegPred, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S]> +]>; +def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>; + +// STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg. 
+def A57WriteStrAm3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S]> +]>; +def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>; +def A57WriteStrAm3X2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S]> +]>; +def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>; + +// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback) +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM", + "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)", + "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>; + +// Store, register pre-indexed: +// 1(1) "S, I0/I1" for plus reg +// 3(2) "I0/I1, S" for minus reg +// 1(2) "S, M" for scaled plus lsl2 +// 3(2) "I0/I1, S" for other scaled +def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<IsLdstsoMinusRegPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<IsLdstsoScaledPredX2, [A57Write_1cyc_1S_1M]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]> +]>; +def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledPredX2, [A57WrBackTwo]>, + SchedVar<IsLdstsoMinusRegPredX2, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre], + (instregex "STR_PRE_REG", "STRB_PRE_REG")>; + +// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE) +// 1(1) "S, I0/I1" for imm or reg plus +// 3(2) "I0/I1, S" for reg minus +def A57WriteStrAm3PreX2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]> +]>; +def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2], + (instregex "STRH_PRE")>; + +def A57WriteStrAm3PreX3 : 
SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]> +]>; +def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3], + (instregex "STRD_PRE")>; + +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM", + "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>; + +// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not) +def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG", + "STRB(T?)_POST_REG", "STR(B?)T_POST$")>; + +// post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr +// 1(1) "S, I0/I1" both for reg or imm +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], + (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>; + +// --- Store multiple instructions --- +// TODO: no writeback latency defined in documentation +def A57WriteSTM : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S]> +]>; +def A57WriteSTM_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]> +]>; + +def 
: InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>; +def : InstRW<[A57WrBackOne, A57WriteSTM_Upd], + (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>; + +// --- 3.10 FP Data Processing Instructions --- +def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>; +def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>; + +def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>; + +// fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional +def A57WriteVcmp : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>, + SchedVar<NoSchedPred, [A57Write_3cyc_1X]> +]>; +def : InstRW<[A57WriteVcmp], + (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>; + +// fp convert +def : InstRW<[A57Write_5cyc_1V], (instregex + "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>; + +def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>; + +// FP round to integral +def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>; + +// FP divide, FP square root +def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>; +def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>; +def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>; +def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>; + +// FP max/min +def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>; + +// FP multiply-accumulate pipelines support late forwarding of the result +// from FP multiply μops to the accumulate operands of an +// FP multiply-accumulate μop. 
The latter can potentially be issued 1 cycle +// after the FP multiply μop has been issued. +// FP multiply, FZ +def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; } + +def : SchedAlias<WriteFPMUL32, A57WriteVMUL>; +def : SchedAlias<WriteFPMUL64, A57WriteVMUL>; +def : ReadAdvance<ReadFPMUL, 0>; + +// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate +// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS +def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } + +// VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.) +// VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.) +// Currently, there is no way to define different read advances for VFMA operand +// from VFMA or from VMUL, so a read advance of 5 is used for both. +// Zero latency (instead of one) for VMUL->VFMA should not break anything. +// The same situation applies to ASIMD VMUL/VFMA instructions. +// def A57ReadVFMA : SchedRead; +// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>; +// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>; +def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>; + +def : SchedAlias<WriteFPMAC32, A57WriteVFMA>; +def : SchedAlias<WriteFPMAC64, A57WriteVFMA>; +def : SchedAlias<ReadFPMAC, A57ReadVFMA5>; + +def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>; +def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>; + +// --- 3.11 FP Miscellaneous Instructions --- +// VMOV: 3cyc "F0/F1" for imm/reg +def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>; +def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>; + +// 5cyc L for FP transfer, vfp to core reg, +// 5cyc L for FP transfer, core reg to vfp +def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>; +// VMOVRRS/VMOVRRD in common code declared with one WriteFPMOV (instead of 2). 
+def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>; + +// 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg +def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>; + +// --- 3.12 FP Load Instructions --- +def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>; + +def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>; + +// FP load multiple (VLDM) + +def A57VLDMOpsListUncond : A57WriteLMOpsListType< + [A57Write_5cyc_1L, A57Write_5cyc_1L, + A57Write_6cyc_1L, A57Write_6cyc_1L, + A57Write_7cyc_1L, A57Write_7cyc_1L, + A57Write_8cyc_1L, A57Write_8cyc_1L, + A57Write_9cyc_1L, A57Write_9cyc_1L, + A57Write_10cyc_1L, A57Write_10cyc_1L, + A57Write_11cyc_1L, A57Write_11cyc_1L, + A57Write_12cyc_1L, A57Write_12cyc_1L]>; +def A57WriteVLDMuncond : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]> +]> { let Variadic=1; } + +def A57VLDMOpsListCond : A57WriteLMOpsListType< + [A57Write_5cyc_1L, A57Write_6cyc_1L, + A57Write_7cyc_1L, A57Write_8cyc_1L, + A57Write_9cyc_1L, A57Write_10cyc_1L, + A57Write_11cyc_1L, A57Write_12cyc_1L, + A57Write_13cyc_1L, A57Write_14cyc_1L, + A57Write_15cyc_1L, A57Write_16cyc_1L, + A57Write_17cyc_1L, A57Write_18cyc_1L, + A57Write_19cyc_1L, A57Write_20cyc_1L]>; +def A57WriteVLDMcond : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>, + SchedVar<A57LMAddrPred3, 
A57VLDMOpsListCond.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]> +]> { let Variadic=1; } + +def A57WriteVLDM : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>, + SchedVar<NoSchedPred, [A57WriteVLDMuncond]> +]> { let Variadic=1; } + +def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>; + +def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType< + [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I, + A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I, + A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, + A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I, + A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I, + A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>; +def A57WriteVLDMuncond_UPD : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]> +]> { let Variadic=1; } + +def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType< + [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I, + A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I, + A57Write_13cyc_1L_1I, 
A57Write_14cyc_1L_1I, + A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I, + A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I, + A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>; +def A57WriteVLDMcond_UPD : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]> +]> { let Variadic=1; } + +def A57WriteVLDM_UPD : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>, + SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]> +]> { let Variadic=1; } + +def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD], + (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>; + +// --- 3.13 FP Store Instructions --- +def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>; + +def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>; + +def A57WriteVSTMs : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S]> +]>; +def A57WriteVSTMd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>, + SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>, + SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>, + SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>, + SchedVar<A57LMAddrPred5, 
[A57Write_10cyc_1S]>, + SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>, + SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>, + SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1S]> +]>; +// Writeback forms: the NoSchedPred fallback uses the A57LMAddrPred2 entry. +def A57WriteVSTMs_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]> +]>; +def A57WriteVSTMd_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>, + SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>, + SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>, + SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>, + SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>, + SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>, + SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>, + SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1S_1I]> +]>; + +def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>; +def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>; +def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd], + (instregex "VSTM(SIA_UPD|SDB_UPD)")>; +def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd], + (instregex "VSTM(DIA_UPD|DDB_UPD)")>; + +// --- 3.14 ASIMD Integer Instructions --- + +// ASIMD absolute diff, 3cyc F0/F1 for integer VABD +def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>; + +// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form +def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>; +def : InstRW<[A57WriteVABAD, A57ReadVABAD], + (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>; +def A57WriteVABAQ : 
SchedWriteRes<[A57UnitX]> { let Latency = 5; } +def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>; +def : InstRW<[A57WriteVABAQ, A57ReadVABAQ], + (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>; + +// ASIMD absolute diff accum long: 4(1) F1 for VABAL +def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>; +def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>; + +// ASIMD absolute diff long: 3cyc F0/F1 for VABDL +def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>; + +// ASIMD arith, basic +def : InstRW<[A57Write_3cyc_1V], (instregex "VADD", "VADDL", "VADDW", + "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)", + "VPADDi", "VPADDL", "VSUB", "VSUBL", "VSUBW")>; + +// ASIMD arith, complex +def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB", + "VQABS", "VQADD", "VQNEG", "VQSUB", + "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>; + +// ASIMD compare +def : InstRW<[A57Write_3cyc_1V], + (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>; + +// ASIMD logical +def : InstRW<[A57Write_3cyc_1V], + (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>; + +// ASIMD max/min +def : InstRW<[A57Write_3cyc_1V], + (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>; + +// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later +// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply +// and multiply-with-accumulate instructions relative to r0pX. 
+def A57WriteVMULD_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def : InstRW<[A57WriteVMULD_VecInt], (instregex + "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)", + "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>; + +// ASIMD multiply, Q-form: 6cyc F0 for r0pX, 5cyc F0 for r1p0 and later +def A57WriteVMULQ_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>; +def : InstRW<[A57WriteVMULQ_VecInt], (instregex + "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)", + "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>; + +// ASIMD multiply accumulate, D-form +// 5cyc F0 for r0pX, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence +// (4 or 3 ReadAdvance = 5cyc or 4cyc write latency minus the 1cyc sequence) +def A57WriteVMLAD_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def A57ReadVMLAD_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]> +]>; +def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt], + (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>; + +// ASIMD multiply accumulate, Q-form +// 6cyc F0 for r0pX, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence +// (4 or 3 ReadAdvance = 6cyc or 5cyc write latency minus the 2cyc sequence) +def A57WriteVMLAQ_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>; +def A57ReadVMLAQ_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]> +]>; +def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt], + (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>; + +// ASIMD multiply accumulate long 
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence +// (4 or 3 ReadAdvance) +def A57WriteVMLAL_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def A57ReadVMLAL_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]> +]>; +def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt], + (instregex "VMLAL(s|u)", "VMLSL(s|u)")>; + +// ASIMD multiply accumulate saturating long +// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence +// (3 or 2 ReadAdvance) +def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def A57ReadVQDMLAL_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]> +]>; +def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt], + (instregex "VQDMLAL", "VQDMLSL")>; + +// ASIMD multiply long +// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later +def A57WriteVMULL_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def : InstRW<[A57WriteVMULL_VecInt], + (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>; + +// ASIMD pairwise add and accumulate +// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance) +def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>; +def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>; + +// ASIMD shift accumulate +// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance) +def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>; +def : 
InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>; + +// ASIMD shift by immed, basic +def : InstRW<[A57Write_3cyc_1X], + (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>; + +// ASIMD shift by immed, complex +def : InstRW<[A57Write_4cyc_1X], (instregex + "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)", + "VRSHRN")>; + +// ASIMD shift by immed and insert, basic, D-form +def : InstRW<[A57Write_4cyc_1X], (instregex + "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>; + +// ASIMD shift by immed and insert, basic, Q-form +def : InstRW<[A57Write_5cyc_1X], (instregex + "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD shift by register, basic, D-form +def : InstRW<[A57Write_3cyc_1X], (instregex + "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>; + +// ASIMD shift by register, basic, Q-form +def : InstRW<[A57Write_4cyc_1X], (instregex + "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD shift by register, complex, D-form +// VQRSHL, VQSHL, VRSHL +def : InstRW<[A57Write_4cyc_1X], (instregex + "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", + "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>; + +// ASIMD shift by register, complex, Q-form +def : InstRW<[A57Write_5cyc_1X], (instregex + "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", + "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>; + +// --- 3.15 ASIMD Floating-Point Instructions --- +// ASIMD FP absolute value +def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>; + +// ASIMD FP arith +def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)", + "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>; + +// ASIMD FP compare +def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)", + "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>; + +// ASIMD FP convert, integer +def : InstRW<[A57Write_5cyc_1V], (instregex + "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)", + 
"VCVT(f2xsq|f2xuq|xs2fq|xu2fq)", + "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>; + +// ASIMD FP convert, half-precision: 8cyc F0/F1 +def : InstRW<[A57Write_8cyc_1V], (instregex + "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)", + "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)", + "VCVT(f2h|h2f)")>; + +// ASIMD FP max/min +def : InstRW<[A57Write_5cyc_1V], (instregex + "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>; + +// ASIMD FP multiply +def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>; + +// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence +def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57ReadVMLA_VecFP : + SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>; +def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP], + (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>; + +// ASIMD FP negate +def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>; + +// ASIMD FP round to integral +def : InstRW<[A57Write_5cyc_1V], (instregex + "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>; + +// --- 3.16 ASIMD Miscellaneous Instructions --- + +// ASIMD bitwise insert +def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>; + +// ASIMD count +def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>; + +// ASIMD duplicate, core reg: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>; + +// ASIMD duplicate, scalar: 3cyc "F0/F1" +def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>; + +// ASIMD extract +def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>; + +// ASIMD move, immed +def : InstRW<[A57Write_3cyc_1V], (instregex + "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)", + "VMOVQ0")>; + +// ASIMD move, narrowing +def : InstRW<[A57Write_3cyc_1V], 
(instregex "VMOVN")>; + +// ASIMD move, saturating +def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>; + +// ASIMD reciprocal estimate +def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>; + +// ASIMD reciprocal step, FZ +def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>; + +// ASIMD reverse, swap, table lookup (1-2 reg) +def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>; + +// ASIMD table lookup (3-4 reg) +def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>; + +// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1" +def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>; + +// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>; + +// ASIMD transpose +def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>; + +// ASIMD unzip/zip, D-form +def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], + (instregex "VUZPd", "VZIPd")>; + +// ASIMD unzip/zip, Q-form +def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V], + (instregex "VUZPq", "VZIPq")>; + +// --- 3.17 ASIMD Load Instructions --- + +// Overridden via InstRW for this processor. 
+def : WriteRes<WriteVLD1, []>; +def : WriteRes<WriteVLD2, []>; +def : WriteRes<WriteVLD3, []>; +def : WriteRes<WriteVLD4, []>; +def : WriteRes<WriteVST1, []>; +def : WriteRes<WriteVST2, []>; +def : WriteRes<WriteVST3, []>; +def : WriteRes<WriteVST4, []>; + +// 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency +def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>; +def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne], + (instregex "VLD1(d|q)(8|16|32|64)wb")>; + +// 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency +def : InstRW<[A57Write_6cyc_1L], + (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>; + +def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne], + (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>; + +// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V], (instregex + "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex + "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>; + +// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V], + (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>; + +// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD2b(8|16|32)wb")>; + +// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V], + (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$", + "VLD2LN(d|q)(8|16|32)Pseudo$")>; +// 2 results + wb result +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne], + (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>; +// 1 result + wb result +def : InstRW<[A57Write_8cyc_1L_1V_1I, 
A57WrBackOne], + (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb", + "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>; + +// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1" +// 3 results +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V], + (instregex "VLD3(d|q)(8|16|32)$")>; +// 1 result +def : InstRW<[A57Write_9cyc_1L_1V], + (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>; +// 3 results + wb +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3(d|q)(8|16|32)_UPD$")>; +// 1 result + wb +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>; + +// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V], + (instregex "VLD3LN(d|q)32$", + "VLD3LN(d|q)32Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)32_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)32Pseudo_UPD")>; + +// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V], + (instregex "VLD3LN(d|q)(8|16)$", + "VLD3LN(d|q)(8|16)Pseudo$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)(8|16)_UPD")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>; + +// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V], + (instregex "VLD3DUP(d|q)(8|16|32)$", + "VLD3DUP(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + 
(instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>; + +// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, + A57Write_9cyc_1L_1V], + (instregex "VLD4(d|q)(8|16|32)$")>; +def : InstRW<[A57Write_9cyc_1L_1V], + (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4(d|q)(8|16|32)_UPD")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>; + +// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, + A57Write_8cyc_1L_1V], + (instregex "VLD4LN(d|q)32$", + "VLD4LN(d|q)32Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57WrBackOne], + (instregex "VLD4LN(d|q)32_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4LN(d|q)32Pseudo_UPD")>; + +// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, + A57Write_9cyc_1L_1V], + (instregex "VLD4LN(d|q)(8|16)$", + "VLD4LN(d|q)(8|16)Pseudo$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57WrBackOne], + (instregex "VLD4LN(d|q)(8|16)_UPD")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>; + +// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, + A57Write_8cyc_1L_1V], + (instregex "VLD4DUP(d|q)(8|16|32)$", + "VLD4DUP(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57WrBackOne], + (instregex 
"VLD4DUP(d|q)(8|16|32)_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>; + +// --- 3.18 ASIMD Store Instructions --- + +// ASIMD store, 1 element, multiple, 1 reg: 1cyc S +def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>; +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], + (instregex "VST1d(8|16|32|64)wb")>; +// ASIMD store, 1 element, multiple, 2 reg: 2cyc S +def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>; +def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I], + (instregex "VST1q(8|16|32|64)wb")>; +// ASIMD store, 1 element, multiple, 3 reg: 3cyc S +def : InstRW<[A57Write_3cyc_1S], + (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I], + (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>; +// ASIMD store, 1 element, multiple, 4 reg: 4cyc S +def : InstRW<[A57Write_4cyc_1S], + (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>; +def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I], + (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>; +// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S" +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>; +// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S" +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST2(d|b)(8|16|32)$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST2(b|d)(8|16|32)wb")>; +// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S" +def : InstRW<[A57Write_4cyc_1S_1V], + (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I], + (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>; +// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S" +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST2LN(d|q)(8|16|32)$", 
"VST2LN(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST2LN(d|q)(8|16|32)_UPD", + "VST2LN(d|q)(8|16|32)Pseudo_UPD")>; +// ASIMD store, 3 element, multiple, 3 reg +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST3(d|q)(8|16|32)_UPD", + "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>; +// ASIMD store, 3 element, one lane +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST3LN(d|q)(8|16|32)_UPD", + "VST3LN(d|q)(8|16|32)Pseudo_UPD")>; +// ASIMD store, 4 element, multiple, 4 reg +def : InstRW<[A57Write_4cyc_1S_1V], + (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>; +def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I], + (instregex "VST4(d|q)(8|16|32)_UPD", + "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>; +// ASIMD store, 4 element, one lane +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST4LN(d|q)(8|16|32)_UPD", + "VST4LN(d|q)(8|16|32)Pseudo_UPD")>; + +// --- 3.19 Cryptography Extensions --- +// Crypto AES ops +// AESD, AESE, AESIMC, AESMC: 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>; +// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>; +// Crypto SHA1 xor ops: 6cyc F0/F1 +def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>; +// Crypto SHA1 fast ops: 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>; +// Crypto SHA1 slow ops: 6cyc F0 +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>; +// Crypto SHA256 fast ops: 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>; +// Crypto SHA256 slow ops: 
6cyc F0 +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>; + +// --- 3.20 CRC --- +def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>; + +// ----------------------------------------------------------------------------- +// Common definitions +def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; } +def : SchedAlias<WriteALU, A57Write_1cyc_1I>; + +def : SchedAlias<WriteBr, A57Write_1cyc_1B>; +def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>; +def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>; +def : SchedAlias<WritePreLd, A57Write_4cyc_1L>; + +def : SchedAlias<WriteLd, A57Write_4cyc_1L>; +def : SchedAlias<WriteST, A57Write_1cyc_1S>; +def : ReadAdvance<ReadALU, 0>; + +} // SchedModel = CortexA57Model + diff --git a/lib/Target/ARM/ARMScheduleA57WriteRes.td b/lib/Target/ARM/ARMScheduleA57WriteRes.td new file mode 100644 index 000000000000..670717dc7c13 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleA57WriteRes.td @@ -0,0 +1,323 @@ +//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming convention is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: A57Write +// Latency: #cyc +// MicroOp Count/Types: #(B|I|M|L|S|X|W|V) +// +// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are +// 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and +// four to V pipes. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define Generic 1 micro-op types + +def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } +def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } +def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17; + let ResourceCycles = [17]; } +def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; + let ResourceCycles = [18]; } +def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; + let ResourceCycles = [19]; } +def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20; + let ResourceCycles = [20]; } +def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; } +def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; } +def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; } +def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; } +def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; } +def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; } +def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; } +def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; } +def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32; + let ResourceCycles = [32]; } +def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; + let ResourceCycles = [32]; } +def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; + let ResourceCycles = [35]; } +def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; } +def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; } +def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } +def 
A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } + +// A57Write_3cyc_1L - A57Write_20cyc_1L +foreach Lat = 3-20 in { + def A57Write_#Lat#cyc_1L : SchedWriteRes<[A57UnitL]> { + let Latency = Lat; + } +} + +// A57Write_4cyc_1S - A57Write_16cyc_1S +foreach Lat = 4-16 in { + def A57Write_#Lat#cyc_1S : SchedWriteRes<[A57UnitS]> { + let Latency = Lat; + } +} + +def A57Write_4cyc_1M : SchedWriteRes<[A57UnitL]> { let Latency = 4; } +def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; } +def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; } +def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 6; } +def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 6; } +def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 8; } +def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } +def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } + + +//===----------------------------------------------------------------------===// +// Define Generic 2 micro-op types + +def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 64; + let NumMicroOps = 2; + let ResourceCycles = [32, 32]; +} +def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV, + A57UnitX]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV, + A57UnitX]> { + let Latency = 7; + let NumMicroOps = 2; +} +def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 9; + let NumMicroOps = 2; +} +def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 9; + let 
NumMicroOps = 2; +} +def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS, + A57UnitI]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS, + A57UnitI]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS, + A57UnitI]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS, + A57UnitM]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_3cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 3; + let NumMicroOps = 
2; +} +def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 36; + let NumMicroOps = 2; + let ResourceCycles = [18, 18]; +} +def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// A57Write_3cyc_1L_1I - A57Write_20cyc_1L_1I +foreach Lat = 3-20 in { + def A57Write_#Lat#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> { + let Latency = Lat; let NumMicroOps = 2; + } +} + +def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} +def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// A57Write_4cyc_1S_1I - A57Write_16cyc_1S_1I +foreach Lat = 4-16 in { + def A57Write_#Lat#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> { + let Latency = Lat; let NumMicroOps = 2; + } +} + +def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 4; + let NumMicroOps = 2; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 3; +} +def 
A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} +def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, + A57UnitV, + A57UnitI]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, + A57UnitV, + A57UnitI]> { + let Latency = 4; + let NumMicroOps = 3; +} +def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> { + let Latency = 4; + let NumMicroOps = 3; +} +def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, + A57UnitV, + A57UnitI]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, + A57UnitV, + A57UnitI]> { + let Latency = 9; + let NumMicroOps = 3; +} diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index d2630685d91b..af682dd8321c 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -234,6 +234,10 @@ protected: /// CPSR setting instruction. bool AvoidCPSRPartialUpdate = false; + /// CheapPredicableCPSRDef - If true, disable +1 predication cost + /// for instructions updating CPSR. Enabled for Cortex-A57. + bool CheapPredicableCPSRDef = false; + /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting /// movs with shifter operand (i.e. asr, lsl, lsr). 
bool AvoidMOVsShifterOperand = false; @@ -543,6 +547,7 @@ public: bool nonpipelinedVFP() const { return NonpipelinedVFP; } bool prefers32BitThumb() const { return Pref32BitThumb; } bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } + bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRetAddrStack() const { return HasRetAddrStack; } bool hasMPExtension() const { return HasMPExtension; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 0fef91ec4d3e..b76da727237c 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -3419,9 +3419,7 @@ int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI, int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode()); if (NewOpcode >= 0) return NewOpcode; - - dbgs() << "Cannot convert to .new: " << getName(MI.getOpcode()) << '\n'; - llvm_unreachable(nullptr); + return 0; } int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const { diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp index 4593fc92ca6f..35948e36ad91 100644 --- a/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -135,6 +135,14 @@ private: // returns true on success. static bool ReduceXWtoXWSP(MachineInstr *MI, const ReduceEntry &Entry); + // Attempts to reduce LBU/LHU instruction into LBU16/LHU16, + // returns true on success. + static bool ReduceLXUtoLXU16(MachineInstr *MI, const ReduceEntry &Entry); + + // Attempts to reduce SB/SH instruction into SB16/SH16, + // returns true on success. 
+ static bool ReduceSXtoSX16(MachineInstr *MI, const ReduceEntry &Entry); + // Attempts to reduce arithmetic instructions, returns true on success static bool ReduceArithmeticInstructions(MachineInstr *MI, const ReduceEntry &Entry); @@ -162,10 +170,26 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = { {RT_OneInstr, OpCodes(Mips::ADDu_MM, Mips::ADDU16_MM), ReduceArithmeticInstructions, OpInfo(OT_OperandsAll), ImmField(0, 0, 0, -1)}, + {RT_OneInstr, OpCodes(Mips::LBu, Mips::LBU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)}, + {RT_OneInstr, OpCodes(Mips::LBu_MM, Mips::LBU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)}, + {RT_OneInstr, OpCodes(Mips::LHu, Mips::LHU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::LHu_MM, Mips::LHU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, {RT_OneInstr, OpCodes(Mips::LW, Mips::LWSP_MM), ReduceXWtoXWSP, OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)}, {RT_OneInstr, OpCodes(Mips::LW_MM, Mips::LWSP_MM), ReduceXWtoXWSP, OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)}, + {RT_OneInstr, OpCodes(Mips::SB, Mips::SB16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::SB_MM, Mips::SB16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::SH, Mips::SH16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::SH_MM, Mips::SH16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, {RT_OneInstr, OpCodes(Mips::SUBu, Mips::SUBU16_MM), ReduceArithmeticInstructions, OpInfo(OT_OperandsAll), ImmField(0, 0, 0, -1)}, @@ -193,6 +217,13 @@ static bool isMMThreeBitGPRegister(const MachineOperand &MO) { return false; } +// Returns true if the machine operand MO is register $0, $17, or $2-$7. 
+static bool isMMSourceRegister(const MachineOperand &MO) { + if (MO.isReg() && Mips::GPRMM16ZeroRegClass.contains(MO.getReg())) + return true; + return false; +} + // Returns true if the operand Op is an immediate value // and writes the immediate value into variable Imm static bool GetImm(MachineInstr *MI, unsigned Op, int64_t &Imm) { @@ -279,6 +310,32 @@ bool MicroMipsSizeReduce::ReduceArithmeticInstructions( return ReplaceInstruction(MI, Entry); } +bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI, + const ReduceEntry &Entry) { + + if (!ImmInRange(MI, Entry)) + return false; + + if (!isMMThreeBitGPRegister(MI->getOperand(0)) || + !isMMThreeBitGPRegister(MI->getOperand(1))) + return false; + + return ReplaceInstruction(MI, Entry); +} + +bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI, + const ReduceEntry &Entry) { + + if (!ImmInRange(MI, Entry)) + return false; + + if (!isMMSourceRegister(MI->getOperand(0)) || + !isMMThreeBitGPRegister(MI->getOperand(1))) + return false; + + return ReplaceInstruction(MI, Entry); +} + bool MicroMipsSizeReduce::ReduceMBB(MachineBasicBlock &MBB) { bool Modified = false; MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt index 54619589c341..35a67134775a 100644 --- a/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -88,6 +88,3 @@ pr45695.c wasm-o pr49279.c wasm-o pr49390.c wasm-o pr52286.c wasm-o - -# fatal error: error in backend: data symbols must have a size set with .size -921110-1.c wasm-o diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0a41f35f9320..5303d7a406ad 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4753,7 +4753,7 @@ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, SmallVectorImpl<int> &ScaledMask) { assert(0 < Scale 
&& "Unexpected scaling factor"); int NumElts = Mask.size(); - ScaledMask.assign(NumElts * Scale, -1); + ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; @@ -5848,17 +5848,39 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } case ISD::SCALAR_TO_VECTOR: { - // Match against a scalar_to_vector of an extract from a similar vector. + // Match against a scalar_to_vector of an extract from a vector, + // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. SDValue N0 = N.getOperand(0); - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - N0.getOperand(0).getValueType() != VT || - !isa<ConstantSDNode>(N0.getOperand(1)) || - NumElts <= N0.getConstantOperandVal(1) || - !N->isOnlyUserOf(N0.getNode())) + SDValue SrcExtract; + + if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + N0.getOperand(0).getValueType() == VT) { + SrcExtract = N0; + } else if (N0.getOpcode() == ISD::AssertZext && + N0.getOperand(0).getOpcode() == X86ISD::PEXTRW && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) { + SrcExtract = N0.getOperand(0); + assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16); + } else if (N0.getOpcode() == ISD::AssertZext && + N0.getOperand(0).getOpcode() == X86ISD::PEXTRB && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) { + SrcExtract = N0.getOperand(0); + assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8); + } + + if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)) || + NumElts <= SrcExtract.getConstantOperandVal(1)) return false; - Ops.push_back(N0.getOperand(0)); - Mask.push_back(N0.getConstantOperandVal(1)); - Mask.append(NumElts - 1, SM_SentinelUndef); + + SDValue SrcVec = SrcExtract.getOperand(0); + EVT SrcVT = SrcVec.getValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; + + Ops.push_back(SrcVec); + 
Mask.push_back(SrcExtract.getConstantOperandVal(1)); + Mask.append(NumZeros, SM_SentinelZero); + Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); return true; } case X86ISD::PINSRB: @@ -6542,12 +6564,12 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue, APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { - assert((ScalarSize == 32 || ScalarSize == 64) && - "Unsupported floating point scalar size"); - if (ScalarSize == 32) - Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat()); - else - Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble()); + if (ScalarSize == 32) { + Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); + } else { + assert(ScalarSize == 64 && "Unsupported floating point scalar size"); + Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); + } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); @@ -6633,11 +6655,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); - Constant *C = SplatBitSize == 32 - ? ConstantFP::get(Type::getFloatTy(*Ctx), - SplatValue.bitsToFloat()) - : ConstantFP::get(Type::getDoubleTy(*Ctx), - SplatValue.bitsToDouble()); + // Lower the splat via APFloat directly, to avoid any conversion. + Constant *C = + SplatBitSize == 32 + ? 
ConstantFP::get(*Ctx, + APFloat(APFloat::IEEEsingle(), SplatValue)) + : ConstantFP::get(*Ctx, + APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; @@ -8003,7 +8027,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef<int> Mask, SmallVectorImpl<int> &RepeatedMask) { - int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); + auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { @@ -16997,7 +17021,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); - ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); @@ -17024,18 +17048,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is // available. SDValue Cmp; - unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); + unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); if (SSECC == 8) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; - if (SetCCOpcode == ISD::SETUEQ) { + if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : static_cast<unsigned>(ISD::OR); } else { - assert(SetCCOpcode == ISD::SETONE); + assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : @@ -17082,7 +17106,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // 2. 
The original operand type has been promoted to a 256-bit vector. // // Note that condition 2. only applies for AVX targets. - SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); + SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond); return DAG.getZExtOrTrunc(NewOp, dl, VT); } @@ -17122,7 +17146,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; - switch (SetCCOpcode) { + switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; @@ -17137,60 +17161,49 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } // Are we comparing unsigned or signed integers? - unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) - ? X86ISD::VPCOMU : X86ISD::VPCOM; + unsigned Opc = + ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } - // We are handling one of the integer comparisons here. Since SSE only has + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. 
- unsigned Opc; - bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; - bool Subus = false; - - switch (SetCCOpcode) { - default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; - case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGT: Opc = X86ISD::PCMPGT; break; - case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETLE: Opc = X86ISD::PCMPGT; - Invert = true; break; - case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGT: Opc = X86ISD::PCMPGT; - FlipSigns = true; break; - case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULE: Opc = X86ISD::PCMPGT; - FlipSigns = true; Invert = true; break; - } + unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ + : X86ISD::PCMPGT; + bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || + Cond == ISD::SETGE || Cond == ISD::SETUGE; + bool Invert = Cond == ISD::SETNE || + (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); + bool FlipSigns = ISD::isUnsignedIntSetCC(Cond); // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); - bool hasMinMax = - (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) - || (Subtarget.hasSSE2() && (VET == MVT::i8)); - - if (hasMinMax) { - switch (SetCCOpcode) { + bool HasMinMax = + (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || + (Subtarget.hasSSE2() && (VET == MVT::i8)); + bool MinMax = false; + if (HasMinMax) { + switch (Cond) { default: break; case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } - if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + if (MinMax) + Swap = Invert = FlipSigns = false; } - bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); - if (!MinMax && hasSubus) { + bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == 
MVT::i16); + bool Subus = false; + if (!MinMax && HasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> - switch (SetCCOpcode) { + switch (Cond) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a |