Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 46
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 27
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 145
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 8
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 31
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 84
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 16
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 97
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h | 19
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Subtarget.h | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 21
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 95
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 55
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h | 1
-rw-r--r--  llvm/lib/Target/AArch64/SVEInstrFormats.td | 35
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 88
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 106
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 41
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5
-rw-r--r--  llvm/lib/Target/ARM/ARM.td | 10
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 39
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrInfo.td | 19
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrThumb.td | 9
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrThumb2.td | 16
-rw-r--r--  llvm/lib/Target/ARM/ARMSubtarget.cpp | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMSubtarget.h | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp | 8
-rw-r--r--  llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp | 35
-rw-r--r--  llvm/lib/Target/M68k/M68kInstrBits.td | 8
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 45
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 45
-rw-r--r--  llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp | 11
-rw-r--r--  llvm/lib/Target/RISCV/RISCV.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 28
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 175
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 21
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 98
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td | 16
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 20
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 503
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 28
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZk.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 1
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 27
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 4
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 82
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZFrameLowering.h | 3
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.h | 5
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 7
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp | 38
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h | 43
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h | 38
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 16
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp | 41
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp | 34
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp | 18
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h | 4
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 230
-rw-r--r--  llvm/lib/Target/X86/X86LowerAMXType.cpp | 2
88 files changed, 1930 insertions, 868 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index b87468d5c8de..9a04b28a8b8f 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -972,6 +972,10 @@ def ProcessorFeatures {
list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureRCPC, FeaturePerfMon,
FeatureSPE, FeatureFullFP16, FeatureDotProd];
+ list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureRCPC, FeaturePerfMon,
+ FeatureSPE, FeatureFullFP16, FeatureDotProd,
+ FeaturePAuth];
list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
FeatureMatMulInt8, FeatureBF16, FeatureAM,
FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
@@ -1086,6 +1090,8 @@ def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
[TuneR82]>;
def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
[TuneX1]>;
+def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C,
+ [TuneX1]>;
def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2,
[TuneX2]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model,
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 85a9c04a3fef..b54a0eaba7d1 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -95,6 +95,8 @@ public:
void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI);
+ void LowerMOPS(MCStreamer &OutStreamer, const MachineInstr &MI);
+
void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI);
void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
@@ -936,6 +938,43 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer,
.addImm(Size == 4 ? 0 : 2));
}
+void AArch64AsmPrinter::LowerMOPS(llvm::MCStreamer &OutStreamer,
+ const llvm::MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ assert(STI->hasMOPS());
+ assert(STI->hasMTE() || Opcode != AArch64::MOPSMemorySetTaggingPseudo);
+
+ const auto Ops = [Opcode]() -> std::array<unsigned, 3> {
+ if (Opcode == AArch64::MOPSMemoryCopyPseudo)
+ return {AArch64::CPYFP, AArch64::CPYFM, AArch64::CPYFE};
+ if (Opcode == AArch64::MOPSMemoryMovePseudo)
+ return {AArch64::CPYP, AArch64::CPYM, AArch64::CPYE};
+ if (Opcode == AArch64::MOPSMemorySetPseudo)
+ return {AArch64::SETP, AArch64::SETM, AArch64::SETE};
+ if (Opcode == AArch64::MOPSMemorySetTaggingPseudo)
+ return {AArch64::SETGP, AArch64::SETGM, AArch64::MOPSSETGE};
+ llvm_unreachable("Unhandled memory operation pseudo");
+ }();
+ const bool IsSet = Opcode == AArch64::MOPSMemorySetPseudo ||
+ Opcode == AArch64::MOPSMemorySetTaggingPseudo;
+
+ for (auto Op : Ops) {
+ int i = 0;
+ auto MCIB = MCInstBuilder(Op);
+ // Destination registers
+ MCIB.addReg(MI.getOperand(i++).getReg());
+ MCIB.addReg(MI.getOperand(i++).getReg());
+ if (!IsSet)
+ MCIB.addReg(MI.getOperand(i++).getReg());
+ // Input registers
+ MCIB.addReg(MI.getOperand(i++).getReg());
+ MCIB.addReg(MI.getOperand(i++).getReg());
+ MCIB.addReg(MI.getOperand(i++).getReg());
+
+ EmitToStreamer(OutStreamer, MCIB);
+ }
+}
+
void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
@@ -1363,6 +1402,13 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
emitFMov0(*MI);
return;
+ case AArch64::MOPSMemoryCopyPseudo:
+ case AArch64::MOPSMemoryMovePseudo:
+ case AArch64::MOPSMemorySetPseudo:
+ case AArch64::MOPSMemorySetTaggingPseudo:
+ LowerMOPS(*OutStreamer, *MI);
+ return;
+
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*OutStreamer, SM, *MI);
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 109b739528bf..b0f739cc26e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -709,20 +709,24 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
bool AArch64ExpandPseudo::expandCALL_RVMARKER(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
- // Expand CALL_RVMARKER pseudo to a branch, followed by the special `mov x29,
- // x29` marker. Mark the sequence as bundle, to avoid passes moving other code
- // in between.
+ // Expand CALL_RVMARKER pseudo to:
+ // - a branch to the call target, followed by
+ // - the special `mov x29, x29` marker, and
+ // - another branch, to the runtime function
+ // Mark the sequence as a bundle, to avoid passes moving other code in between.
MachineInstr &MI = *MBBI;
MachineInstr *OriginalCall;
- MachineOperand &CallTarget = MI.getOperand(0);
+ MachineOperand &RVTarget = MI.getOperand(0);
+ MachineOperand &CallTarget = MI.getOperand(1);
assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
"invalid operand for regular call");
+ assert(RVTarget.isGlobal() && "invalid operand for attached call");
unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
OriginalCall->addOperand(CallTarget);
- unsigned RegMaskStartIdx = 1;
+ unsigned RegMaskStartIdx = 2;
// Skip register arguments. Those are added during ISel, but are not
// needed for the concrete branch.
while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
@@ -736,17 +740,22 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER(
llvm::drop_begin(MI.operands(), RegMaskStartIdx))
OriginalCall->addOperand(MO);
- auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
.addReg(AArch64::FP, RegState::Define)
.addReg(AArch64::XZR)
.addReg(AArch64::FP)
- .addImm(0)
+ .addImm(0);
+
+ auto *RVCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::BL))
+ .add(RVTarget)
.getInstr();
+
if (MI.shouldUpdateCallSiteInfo())
- MBB.getParent()->moveCallSiteInfo(&MI, Marker);
+ MBB.getParent()->moveCallSiteInfo(&MI, OriginalCall);
+
MI.eraseFromParent();
finalizeBundle(MBB, OriginalCall->getIterator(),
- std::next(Marker->getIterator()));
+ std::next(RVCall->getIterator()));
return true;
}
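
[Illustration, not part of the patch] The CALL_RVMARKER pseudo expanded above is only created for calls carrying the "clang.arc.attachedcall" operand bundle, and the ISelLowering change below adds the bundle's ARC runtime function as an explicit operand, so the expansion becomes BL <callee>, the mov x29, x29 marker, and BL <runtime function>, all in one bundle. A minimal, hedged sketch of producing such a call at the IR level; emitCallWithAttachedCall and RetainRVFn are placeholder names, and the bundle is assumed to carry the runtime function as its only input:

    // Hypothetical helper; not part of this patch.
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    static CallInst *emitCallWithAttachedCall(IRBuilder<> &B,
                                              FunctionCallee Callee,
                                              ArrayRef<Value *> Args,
                                              Function *RetainRVFn) {
      // e.g. RetainRVFn = objc_retainAutoreleasedReturnValue
      OperandBundleDef ARCBundle("clang.arc.attachedcall",
                                 std::vector<Value *>{RetainRVFn});
      return B.CreateCall(Callee, Args, {ARCBundle});
    }
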
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a26bbc77f248..c539c8617d99 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
@@ -938,19 +939,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
- MaxStoresPerMemset = Subtarget->requiresStrictAlign()
- ? MaxStoresPerMemsetOptSize : 32;
+ MaxStoresPerMemset =
+ Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpyOptSize = 4;
- MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
- ? MaxStoresPerMemcpyOptSize : 16;
+ MaxStoresPerMemcpy =
+ Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
- MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
+ MaxStoresPerMemmoveOptSize = 4;
+ MaxStoresPerMemmove = 4;
MaxLoadsPerMemcmpOptSize = 4;
- MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
- ? MaxLoadsPerMemcmpOptSize : 8;
+ MaxLoadsPerMemcmp =
+ Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
setStackPointerRegisterToSaveRestore(AArch64::SP);
@@ -1426,6 +1428,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
}
+ if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
+ // Only required for llvm.aarch64.mops.memset.tag
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ }
+
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
@@ -2201,7 +2208,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::INSR)
MAKE_CASE(AArch64ISD::PTEST)
MAKE_CASE(AArch64ISD::PTRUE)
- MAKE_CASE(AArch64ISD::PFALSE)
MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
@@ -2268,6 +2274,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::UADDLP)
MAKE_CASE(AArch64ISD::CALL_RVMARKER)
MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
+ MAKE_CASE(AArch64ISD::MOPS_MEMSET)
+ MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
+ MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
+ MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
}
#undef MAKE_CASE
return nullptr;
@@ -3746,6 +3756,10 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
+ // Bitcasts between f16 and bf16 are legal.
+ if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
+ return Op;
+
assert(ArgVT == MVT::i16);
SDLoc DL(Op);
@@ -4056,6 +4070,39 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
}
+SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::aarch64_mops_memset_tag: {
+ auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ SDValue Chain = Node->getChain();
+ SDValue Dst = Op.getOperand(2);
+ SDValue Val = Op.getOperand(3);
+ Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
+ SDValue Size = Op.getOperand(4);
+ auto Alignment = Node->getMemOperand()->getAlign();
+ bool IsVol = Node->isVolatile();
+ auto DstPtrInfo = Node->getPointerInfo();
+
+ const auto &SDI =
+ static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
+ SDValue MS =
+ SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
+ Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
+
+ // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
+ // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
+ // LowerOperationWrapper will complain that the number of results has
+ // changed.
+ return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
+ }
+ }
+}
+
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -5123,6 +5170,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::MULHU:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
/*OverrideNEON=*/true);
+ case ISD::INTRINSIC_W_CHAIN:
+ return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::ATOMIC_STORE:
@@ -6475,12 +6524,18 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned CallOpc = AArch64ISD::CALL;
// Calls with operand bundle "clang.arc.attachedcall" are special. They should
- // be expanded to the call, directly followed by a special marker sequence.
- // Use the CALL_RVMARKER to do that.
+ // be expanded to the call, directly followed by a special marker sequence and
+ // a call to an ObjC library function. Use CALL_RVMARKER to do that.
if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
assert(!IsTailCall &&
"tail calls cannot be marked with clang.arc.attachedcall");
CallOpc = AArch64ISD::CALL_RVMARKER;
+
+ // Add a target global address for the retainRV/claimRV runtime function
+ // just before the call target.
+ Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
+ auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
+ Ops.insert(Ops.begin() + 1, GA);
}
// Returns a chain and a flag for retval copy to use.
@@ -9985,8 +10040,9 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
// The only legal i1 vectors are SVE vectors, so we can use SVE-specific
// lowering code.
if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
+ // We can handle the zero case during isel.
if (ConstVal->isZero())
- return DAG.getNode(AArch64ISD::PFALSE, dl, VT);
+ return Op;
if (ConstVal->isOne())
return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
}
@@ -11869,6 +11925,19 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
return true;
}
+ case Intrinsic::aarch64_mops_memset_tag: {
+ Value *Dst = I.getArgOperand(0);
+ Value *Val = I.getArgOperand(1);
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(Val->getType());
+ Info.ptrVal = Dst;
+ Info.offset = 0;
+ Info.align = I.getParamAlign(0).valueOrOne();
+ Info.flags = MachineMemOperand::MOStore;
+ // The size of the memory being operated on is unknown at this point
+ Info.size = MemoryLocation::UnknownSize;
+ return true;
+ }
default:
break;
}
@@ -15092,7 +15161,7 @@ static bool isAllInactivePredicate(SDValue N) {
while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
N = N.getOperand(0);
- return N.getOpcode() == AArch64ISD::PFALSE;
+ return ISD::isConstantSplatVectorAllZeros(N.getNode());
}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
@@ -15393,6 +15462,52 @@ static SDValue performIntrinsicCombine(SDNode *N,
return SDValue();
}
+static bool isCheapToExtend(const SDValue &N) {
+ unsigned OC = N->getOpcode();
+ return OC == ISD::LOAD || OC == ISD::MLOAD ||
+ ISD::isConstantSplatVectorAllZeros(N.getNode());
+}
+
+static SDValue
+performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // If we have (sext (setcc A B)) and A and B are cheap to extend,
+ // we can move the sext into the arguments and have the same result. For
+ // example, if A and B are both loads, we can make those extending loads and
+ // avoid an extra instruction. This pattern appears often in VLS code
+ // generation where the inputs to the setcc have a different size to the
+ // instruction that wants to use the result of the setcc.
+ assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+ N->getOperand(0)->getOpcode() == ISD::SETCC);
+ const SDValue SetCC = N->getOperand(0);
+
+ const SDValue CCOp0 = SetCC.getOperand(0);
+ const SDValue CCOp1 = SetCC.getOperand(1);
+ if (!CCOp0->getValueType(0).isInteger() ||
+ !CCOp1->getValueType(0).isInteger())
+ return SDValue();
+
+ ISD::CondCode Code =
+ cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
+
+ ISD::NodeType ExtType =
+ isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ if (isCheapToExtend(SetCC.getOperand(0)) &&
+ isCheapToExtend(SetCC.getOperand(1))) {
+ const SDValue Ext1 =
+ DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
+ const SDValue Ext2 =
+ DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
+
+ return DAG.getSetCC(
+ SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
+ cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
+ }
+
+ return SDValue();
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -15411,6 +15526,12 @@ static SDValue performExtendCombine(SDNode *N,
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
}
+
+ if (N->getValueType(0).isFixedLengthVector() &&
+ N->getOpcode() == ISD::SIGN_EXTEND &&
+ N->getOperand(0)->getOpcode() == ISD::SETCC)
+ return performSignExtendSetCCCombine(N, DCI, DAG);
+
return SDValue();
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ca6c70297c0b..2138c0ffe70a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -323,7 +323,6 @@ enum NodeType : unsigned {
INSR,
PTEST,
PTRUE,
- PFALSE,
BITREVERSE_MERGE_PASSTHRU,
BSWAP_MERGE_PASSTHRU,
@@ -453,6 +452,12 @@ enum NodeType : unsigned {
LDP,
STP,
STNP,
+
+ // Memory Operations
+ MOPS_MEMSET,
+ MOPS_MEMSET_TAGGING,
+ MOPS_MEMCOPY,
+ MOPS_MEMMOVE,
};
} // end namespace AArch64ISD
@@ -890,6 +895,7 @@ private:
SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 93c17133c845..a9191924129c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -93,9 +93,18 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// before the assembly printer.
unsigned NumBytes = 0;
const MCInstrDesc &Desc = MI.getDesc();
+
+ // The size should preferably be set in
+ // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
+ // Specific cases below handle instructions of variable size.
switch (Desc.getOpcode()) {
default:
- // Anything not explicitly designated otherwise is a normal 4-byte insn.
+ if (Desc.getSize())
+ return Desc.getSize();
+
+ // Anything not explicitly designated otherwise (i.e. pseudo-instructions
+ // with fixed constant size but not specified in the .td file) is a normal
+ // 4-byte insn.
NumBytes = 4;
break;
case TargetOpcode::STACKMAP:
@@ -115,29 +124,9 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (NumBytes == 0)
NumBytes = 4;
break;
- case AArch64::TLSDESC_CALLSEQ:
- // This gets lowered to an instruction sequence which takes 16 bytes
- NumBytes = 16;
- break;
- case AArch64::SpeculationBarrierISBDSBEndBB:
- // This gets lowered to 2 4-byte instructions.
- NumBytes = 8;
- break;
- case AArch64::SpeculationBarrierSBEndBB:
- // This gets lowered to 1 4-byte instructions.
- NumBytes = 4;
- break;
- case AArch64::JumpTableDest32:
- case AArch64::JumpTableDest16:
- case AArch64::JumpTableDest8:
- NumBytes = 12;
- break;
case AArch64::SPACE:
NumBytes = MI.getOperand(1).getImm();
break;
- case AArch64::StoreSwiftAsyncContext:
- NumBytes = 20;
- break;
case TargetOpcode::BUNDLE:
NumBytes = getInstBundleLength(MI);
break;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c8a697c8b82f..83bf89ff97c5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -780,6 +780,7 @@ def : Pat<(AArch64LOADgot texternalsym:$addr),
def : Pat<(AArch64LOADgot tconstpool:$addr),
(LOADgot tconstpool:$addr)>;
+// In general these get lowered into a sequence of three 4-byte instructions.
// 32-bit jump table destination is actually only 2 instructions since we can
// use the table itself as a PC-relative base. But optimization occurs after
// branch relaxation so be pessimistic.
@@ -815,8 +816,12 @@ let hasSideEffects = 1, isCodeGenOnly = 1 in {
// SpeculationBarrierEndBB must only be used after an unconditional control
// flow, i.e. after a terminator for which isBarrier is True.
let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ // This gets lowered to a pair of 4-byte instructions.
+ let Size = 8 in
def SpeculationBarrierISBDSBEndBB
: Pseudo<(outs), (ins), []>, Sched<[]>;
+ // This gets lowered to a 4-byte instruction.
+ let Size = 4 in
def SpeculationBarrierSBEndBB
: Pseudo<(outs), (ins), []>, Sched<[]>;
}
@@ -2324,8 +2329,8 @@ def : Pat<(AArch64call GPR64noip:$Rn),
(BLRNoIP GPR64noip:$Rn)>,
Requires<[SLSBLRMitigation]>;
-def : Pat<(AArch64call_rvmarker GPR64:$Rn),
- (BLR_RVMARKER GPR64:$Rn)>,
+def : Pat<(AArch64call_rvmarker (i64 tglobaladdr:$rvfunc), GPR64:$Rn),
+ (BLR_RVMARKER tglobaladdr:$rvfunc, GPR64:$Rn)>,
Requires<[NoSLSBLRMitigation]>;
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
@@ -2356,7 +2361,8 @@ def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {}
// FIXME: maybe the scratch register used shouldn't be fixed to X1?
// FIXME: can "hasSideEffects be dropped?
-let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
+// This gets lowered to an instruction sequence which takes 16 bytes
+let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, Size = 16,
isCodeGenOnly = 1 in
def TLSDESC_CALLSEQ
: Pseudo<(outs), (ins i64imm:$sym),
@@ -7546,6 +7552,9 @@ def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(f16 (bitconvert (bf16 FPR16:$src))), (f16 FPR16:$src)>;
+def : Pat<(bf16 (bitconvert (f16 FPR16:$src))), (bf16 FPR16:$src)>;
+
let Predicates = [IsLE] in {
def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -8330,26 +8339,67 @@ let Predicates = [HasLS64] in {
}
let Predicates = [HasMOPS] in {
- defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">;
- defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">;
- defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">;
+ let Defs = [NZCV] in {
+ defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">;
+
+ defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">;
+
+ defm SETP : MOPSMemorySetInsns<0b00, "setp">;
+ }
+ let Uses = [NZCV] in {
+ defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">;
+ defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">;
- defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">;
- defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">;
- defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">;
+ defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">;
+ defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">;
- defm SETP : MOPSMemorySetInsns<0b00, "setp">;
- defm SETM : MOPSMemorySetInsns<0b01, "setm">;
- defm SETE : MOPSMemorySetInsns<0b10, "sete">;
+ defm SETM : MOPSMemorySetInsns<0b01, "setm">;
+ defm SETE : MOPSMemorySetInsns<0b10, "sete">;
+ }
}
let Predicates = [HasMOPS, HasMTE] in {
- defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">;
- defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">;
- // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td
- defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">;
+ let Defs = [NZCV] in {
+ defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">;
+ }
+ let Uses = [NZCV] in {
+ defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">;
+ // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td
+ defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">;
+ }
+}
+
+// MOPS Node operands: 0: Dst, 1: Src or Value, 2: Size, 3: Chain
+// MOPS Node results: 0: Dst writeback, 1: Size writeback, 2: Chain
+def SDT_AArch64mops : SDTypeProfile<2, 3, [ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2> ]>;
+def AArch64mops_memset : SDNode<"AArch64ISD::MOPS_MEMSET", SDT_AArch64mops>;
+def AArch64mops_memset_tagging : SDNode<"AArch64ISD::MOPS_MEMSET_TAGGING", SDT_AArch64mops>;
+def AArch64mops_memcopy : SDNode<"AArch64ISD::MOPS_MEMCOPY", SDT_AArch64mops>;
+def AArch64mops_memmove : SDNode<"AArch64ISD::MOPS_MEMMOVE", SDT_AArch64mops>;
+
+// MOPS operations always contain three 4-byte instructions
+let Predicates = [HasMOPS], Defs = [NZCV], Size = 12, mayStore = 1 in {
+ let mayLoad = 1 in {
+ def MOPSMemoryCopyPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
+ (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
+ [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>;
+ def MOPSMemoryMovePseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
+ (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
+ [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>;
+ }
+ let mayLoad = 0 in {
+ def MOPSMemorySetPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
+ (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
+ [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>;
+ }
+}
+let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, mayStore = 1 in {
+ def MOPSMemorySetTaggingPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
+ (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
+ [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>;
}
-let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in
+// This gets lowered into an instruction sequence of 20 bytes
+let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1, Size = 20 in
def StoreSwiftAsyncContext
: Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset),
[]>, Sched<[]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 73a680465f6f..1d162610de9c 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -292,7 +292,13 @@ def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [
SDTCisSameAs<0,1>, SDTCisSameAs<1,2>
]>;
-def AArch64bic : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>;
+def AArch64bic_node : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>;
+
+def AArch64bic : PatFrags<(ops node:$op1, node:$op2),
+ [(and node:$op1, (xor node:$op2, (AArch64dup (i32 -1)))),
+ (and node:$op1, (xor node:$op2, (AArch64dup (i64 -1)))),
+ (and node:$op1, (xor node:$op2, (SVEAllActive))),
+ (AArch64bic_node node:$op1, node:$op2)]>;
let Predicates = [HasSVE] in {
defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
@@ -734,14 +740,14 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
- defm AND_PPzPP : sve_int_pred_log_and<0b0000, "and", int_aarch64_sve_and_z>;
- defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
+ defm AND_PPzPP : sve_int_pred_log_v2<0b0000, "and", int_aarch64_sve_and_z, and>;
+ defm BIC_PPzPP : sve_int_pred_log_v2<0b0001, "bic", int_aarch64_sve_bic_z, AArch64bic>;
defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
- defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
+ defm SEL_PPPP : sve_int_pred_log_v2<0b0011, "sel", vselect, or>;
defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
- defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
+ defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>;
defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index d2d84b2a3f6d..893269c1a7ef 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -15,15 +15,95 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-selectiondag-info"
+SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
+ SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, SDValue Dst,
+ SDValue SrcOrValue, SDValue Size,
+ Align Alignment, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const {
+
+ // Get the constant size of the copy/set.
+ uint64_t ConstSize = 0;
+ if (auto *C = dyn_cast<ConstantSDNode>(Size))
+ ConstSize = C->getZExtValue();
+
+ const bool IsSet = SDOpcode == AArch64ISD::MOPS_MEMSET ||
+ SDOpcode == AArch64ISD::MOPS_MEMSET_TAGGING;
+
+ const auto MachineOpcode = [&]() {
+ switch (SDOpcode) {
+ case AArch64ISD::MOPS_MEMSET:
+ return AArch64::MOPSMemorySetPseudo;
+ case AArch64ISD::MOPS_MEMSET_TAGGING:
+ return AArch64::MOPSMemorySetTaggingPseudo;
+ case AArch64ISD::MOPS_MEMCOPY:
+ return AArch64::MOPSMemoryCopyPseudo;
+ case AArch64ISD::MOPS_MEMMOVE:
+ return AArch64::MOPSMemoryMovePseudo;
+ default:
+ llvm_unreachable("Unhandled MOPS ISD Opcode");
+ }
+ }();
+
+ MachineMemOperand::Flags Flags = MachineMemOperand::MOStore;
+ if (isVolatile)
+ Flags |= MachineMemOperand::MOVolatile;
+ if (!IsSet)
+ Flags |= MachineMemOperand::MOLoad;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ auto *DstOp =
+ MF.getMachineMemOperand(DstPtrInfo, Flags, ConstSize, Alignment);
+ auto *SrcOp =
+ MF.getMachineMemOperand(SrcPtrInfo, Flags, ConstSize, Alignment);
+
+ if (IsSet) {
+ // Extend value to i64 if required
+ if (SrcOrValue.getValueType() != MVT::i64)
+ SrcOrValue = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, SrcOrValue);
+ SDValue Ops[] = {Dst, Size, SrcOrValue, Chain};
+ const EVT ResultTys[] = {MVT::i64, MVT::i64, MVT::Other};
+ MachineSDNode *Node = DAG.getMachineNode(MachineOpcode, DL, ResultTys, Ops);
+ DAG.setNodeMemRefs(Node, {DstOp});
+ return SDValue(Node, 2);
+ } else {
+ SDValue Ops[] = {Dst, SrcOrValue, Size, Chain};
+ const EVT ResultTys[] = {MVT::i64, MVT::i64, MVT::i64, MVT::Other};
+ MachineSDNode *Node = DAG.getMachineNode(MachineOpcode, DL, ResultTys, Ops);
+ DAG.setNodeMemRefs(Node, {DstOp, SrcOp});
+ return SDValue(Node, 3);
+ }
+}
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ const AArch64Subtarget &STI =
+ DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+ if (STI.hasMOPS())
+ return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size,
+ Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+ return SDValue();
+}
+
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
+ const AArch64Subtarget &STI =
+ DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+
+ if (STI.hasMOPS()) {
+ return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
+ Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
+ }
+
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
- const AArch64Subtarget &STI =
- DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
const char *bzeroName =
(V && V->isZero())
? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
@@ -55,6 +135,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
return SDValue();
}
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment, bool isVolatile,
+ MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+ const AArch64Subtarget &STI =
+ DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+ if (STI.hasMOPS()) {
+ return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size,
+ Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+ }
+ return SDValue();
+}
+
static const int kSetTagLoopThreshold = 176;
static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 7d53bd456975..47fe3bf7dcf5 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -19,11 +19,30 @@ namespace llvm {
class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
public:
+ SDValue EmitMOPS(AArch64ISD::NodeType SDOpcode, SelectionDAG &DAG,
+ const SDLoc &DL, SDValue Chain, SDValue Dst,
+ SDValue SrcOrValue, SDValue Size, Align Alignment,
+ bool isVolatile, MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const;
+
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment,
bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
+ SDValue
+ EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size,
+ Align Alignment, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
+
SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
MachinePointerInfo DstPtrInfo,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index a4f4b8582182..8a7e20237271 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -99,6 +99,7 @@ void AArch64Subtarget::initializeProperties() {
case CortexA78C:
case CortexR82:
case CortexX1:
+ case CortexX1C:
PrefFunctionLogAlignment = 4;
break;
case CortexA510:
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 3e3c0f6aba15..7b2bbad30f85 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -63,6 +63,7 @@ public:
CortexA710,
CortexR82,
CortexX1,
+ CortexX1C,
CortexX2,
ExynosM3,
Falkor,
@@ -217,7 +218,6 @@ protected:
bool HasETE = false;
bool HasTRBE = false;
bool HasBRBE = false;
- bool HasPAUTH = false;
bool HasSPE_EEF = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
@@ -510,7 +510,6 @@ public:
bool hasRandGen() const { return HasRandGen; }
bool hasMTE() const { return HasMTE; }
bool hasTME() const { return HasTME; }
- bool hasPAUTH() const { return HasPAUTH; }
// Arm SVE2 extensions
bool hasSVE2AES() const { return HasSVE2AES; }
bool hasSVE2SM4() const { return HasSVE2SM4; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a4d666a0a3c2..b2ffdf949d8b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1886,14 +1886,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
m_Value())))
VecPred = CurrentPred;
}
- // Check if we have a compare/select chain that can be lowered using CMxx &
- // BFI pair.
- if (CmpInst::isIntPredicate(VecPred)) {
- static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
- MVT::v8i16, MVT::v2i32, MVT::v4i32,
- MVT::v2i64};
+ // Check if we have a compare/select chain that can be lowered using
+ // a (F)CMxx & BFI pair.
+ if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
+ VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
+ VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
+ VecPred == CmpInst::FCMP_UNE) {
+ static const auto ValidMinMaxTys = {
+ MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+ MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
+ static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
+
auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
- if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
+ if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
+ (ST->hasFullFP16() &&
+ any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
return LT.first;
}
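
[Illustration, not part of the patch] The getCmpSelInstrCost change above extends the cheap compare/select-chain costing to ordered (and UNE) floating-point predicates and to FP vector types, including f16 vectors when +fullfp16 is available. A hedged example of source code whose vectorized form hits this path; illustrative only:

    // The vectorizer typically turns this loop into an fcmp olt + select
    // chain over <4 x float>, one of the patterns a (F)CMxx & BFI pair can
    // cover, so its cost is now just the type-legalization cost.
    void vmin(const float *a, const float *b, float *out, int n) {
      for (int i = 0; i < n; ++i)
        out[i] = a[i] < b[i] ? a[i] : b[i];
    }
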
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 1f546ad50d57..703e356f016d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -192,6 +192,7 @@ private:
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
unsigned emitConstantPoolEntry(const Constant *CPVal,
@@ -3424,6 +3425,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_VECREDUCE_FADD:
case TargetOpcode::G_VECREDUCE_ADD:
return selectReduction(I, MRI);
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMCPY_INLINE:
+ case TargetOpcode::G_MEMMOVE:
+ case TargetOpcode::G_MEMSET:
+ assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
+ return selectMOPS(I, MRI);
}
return false;
@@ -3481,6 +3488,64 @@ bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
return false;
}
+bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
+ MachineRegisterInfo &MRI) {
+ unsigned Mopcode;
+ switch (GI.getOpcode()) {
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMCPY_INLINE:
+ Mopcode = AArch64::MOPSMemoryCopyPseudo;
+ break;
+ case TargetOpcode::G_MEMMOVE:
+ Mopcode = AArch64::MOPSMemoryMovePseudo;
+ break;
+ case TargetOpcode::G_MEMSET:
+ // For tagged memset see llvm.aarch64.mops.memset.tag
+ Mopcode = AArch64::MOPSMemorySetPseudo;
+ break;
+ }
+
+ auto &DstPtr = GI.getOperand(0);
+ auto &SrcOrVal = GI.getOperand(1);
+ auto &Size = GI.getOperand(2);
+
+ // Create copies of the registers that can be clobbered.
+ const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
+ const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
+ const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
+
+ const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
+ const auto &SrcValRegClass =
+ IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
+
+ // Constrain to specific registers
+ RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
+ RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
+ RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
+
+ MIB.buildCopy(DstPtrCopy, DstPtr);
+ MIB.buildCopy(SrcValCopy, SrcOrVal);
+ MIB.buildCopy(SizeCopy, Size);
+
+ // New instruction uses the copied registers because it must update them.
+ // The defs are not used since they don't exist in G_MEM*. They are still
+ // tied.
+ // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
+ Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
+ Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ if (IsSet) {
+ MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
+ {DstPtrCopy, SizeCopy, SrcValCopy});
+ } else {
+ Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
+ MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
+ {DstPtrCopy, SrcValCopy, SizeCopy});
+ }
+
+ GI.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
@@ -5375,6 +5440,36 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
break;
}
+ case Intrinsic::aarch64_mops_memset_tag: {
+ // Transform
+ // %dst:gpr(p0) = \
+ // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
+ // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
+ // where %dst is updated, into
+ // %Rd:GPR64common, %Rn:GPR64) = \
+ // MOPSMemorySetTaggingPseudo \
+ // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
+ // where Rd and Rn are tied.
+ // It is expected that %val has been extended to s64 in legalization.
+ // Note that the order of the size/value operands is swapped.
+
+ Register DstDef = I.getOperand(0).getReg();
+ // I.getOperand(1) is the intrinsic function
+ Register DstUse = I.getOperand(2).getReg();
+ Register ValUse = I.getOperand(3).getReg();
+ Register SizeUse = I.getOperand(4).getReg();
+
+ // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
+ // Therefore an additional virtual register is requried for the updated size
+ // operand. This value is not accessible via the semantics of the intrinsic.
+ Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
+
+ auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
+ {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
+ Memset.cloneMemRefs(I);
+ constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
+ break;
+ }
}
I.eraseFromParent();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e8894e7933d6..e9df7e001d38 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -699,8 +699,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
- getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
- .libcall();
+ if (ST.hasMOPS()) {
+ // G_BZERO is not supported. Currently it is only emitted by
+ // PreLegalizerCombiner for G_MEMSET with zero constant.
+ getActionDefinitionsBuilder(G_BZERO).unsupported();
+
+ getActionDefinitionsBuilder(G_MEMSET)
+ .legalForCartesianProduct({p0}, {s64}, {s64})
+ .customForCartesianProduct({p0}, {s8}, {s64})
+ .immIdx(0); // Inform verifier imm idx 0 is handled.
+
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
+ .legalForCartesianProduct({p0}, {p0}, {s64})
+ .immIdx(0); // Inform verifier imm idx 0 is handled.
+
+ // G_MEMCPY_INLINE does not have a tailcall immediate
+ getActionDefinitionsBuilder(G_MEMCPY_INLINE)
+ .legalForCartesianProduct({p0}, {p0}, {s64});
+
+ } else {
+ getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
+ .libcall();
+ }
// FIXME: Legal types are only legal with NEON.
getActionDefinitionsBuilder(G_ABS)
@@ -832,6 +852,11 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeAtomicCmpxchg128(MI, MRI, Helper);
case TargetOpcode::G_CTTZ:
return legalizeCTTZ(MI, Helper);
+ case TargetOpcode::G_BZERO:
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMMOVE:
+ case TargetOpcode::G_MEMSET:
+ return legalizeMemOps(MI, Helper);
}
llvm_unreachable("expected switch to return");
@@ -989,6 +1014,15 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
+ case Intrinsic::aarch64_mops_memset_tag: {
+ assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+ // Extend the value to 64 bits
+ MachineIRBuilder MIB(MI);
+ auto &Value = MI.getOperand(3);
+ Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
+ Value.setReg(ZExtValueReg);
+ return true;
+ }
}
return true;
@@ -1359,3 +1393,20 @@ bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
MI.eraseFromParent();
return true;
}
+
+bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
+ LegalizerHelper &Helper) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+
+ // The tagged version (llvm.aarch64.mops.memset.tag) is legalised in legalizeIntrinsic
+ if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
+ // Extend the value operand to 64 bits
+ auto &Value = MI.getOperand(1);
+ Register ZExtValueReg =
+ MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
+ Value.setReg(ZExtValueReg);
+ return true;
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index e2c46f4b4c1f..973f96ff4775 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -56,6 +56,7 @@ private:
bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const;
+ bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 574b22124957..9d4bdbe5d053 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -334,8 +334,6 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>;
-def SDT_AArch64PFalse : SDTypeProfile<1, 0, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>]>;
-def AArch64pfalse : SDNode<"AArch64ISD::PFALSE", SDT_AArch64PFalse>;
let Predicates = [HasSVEorStreamingSVE] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>;
@@ -614,10 +612,10 @@ class sve_int_pfalse<bits<6> opc, string asm>
multiclass sve_int_pfalse<bits<6> opc, string asm> {
def NAME : sve_int_pfalse<opc, asm>;
- def : Pat<(nxv16i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
- def : Pat<(nxv8i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
- def : Pat<(nxv4i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
- def : Pat<(nxv2i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv16i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv8i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv4i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv2i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>;
}
class sve_int_ptest<bits<6> opc, string asm>
@@ -773,7 +771,7 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm,
def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
- // Combine cntp with combine_op
+ // combine_op(x, cntp(all_active, p)) ==> inst p, x
def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred)))),
(!cast<Instruction>(NAME # _B) PPRAny:$pred, $Rn)>;
def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred)))),
@@ -782,6 +780,16 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm,
(!cast<Instruction>(NAME # _S) PPRAny:$pred, $Rn)>;
def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred)))),
(!cast<Instruction>(NAME # _D) PPRAny:$pred, $Rn)>;
+
+ // combine_op(x, cntp(p, p)) ==> inst p, x
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 PPRAny:$pred), (nxv16i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _B) PPRAny:$pred, $Rn)>;
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 PPRAny:$pred), (nxv8i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _H) PPRAny:$pred, $Rn)>;
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv4i1 PPRAny:$pred), (nxv4i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _S) PPRAny:$pred, $Rn)>;
+ def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 PPRAny:$pred), (nxv2i1 PPRAny:$pred)))),
+ (!cast<Instruction>(NAME # _D) PPRAny:$pred, $Rn)>;
}
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
@@ -1633,15 +1641,18 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
!cast<Instruction>(NAME), PTRUE_D>;
}
-multiclass sve_int_pred_log_and<bits<4> opc, string asm, SDPatternOperator op> :
+// An instance of sve_int_pred_log_and but uses op_nopred's first operand as the
+// general predicate.
+multiclass sve_int_pred_log_v2<bits<4> opc, string asm, SDPatternOperator op,
+ SDPatternOperator op_nopred> :
sve_int_pred_log<opc, asm, op> {
- def : Pat<(nxv16i1 (and nxv16i1:$Op1, nxv16i1:$Op2)),
+ def : Pat<(nxv16i1 (op_nopred nxv16i1:$Op1, nxv16i1:$Op2)),
(!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
- def : Pat<(nxv8i1 (and nxv8i1:$Op1, nxv8i1:$Op2)),
+ def : Pat<(nxv8i1 (op_nopred nxv8i1:$Op1, nxv8i1:$Op2)),
(!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
- def : Pat<(nxv4i1 (and nxv4i1:$Op1, nxv4i1:$Op2)),
+ def : Pat<(nxv4i1 (op_nopred nxv4i1:$Op1, nxv4i1:$Op2)),
(!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
- def : Pat<(nxv2i1 (and nxv2i1:$Op1, nxv2i1:$Op2)),
+ def : Pat<(nxv2i1 (op_nopred nxv2i1:$Op1, nxv2i1:$Op2)),
(!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 958e8c9e5bc5..11cc1a01d248 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 7d6845b287bc..bebf032b5535 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,9 +14,12 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -29,6 +32,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
LegacyDivergenceAnalysis *DA;
MemorySSA *MSSA;
+ AliasAnalysis *AA;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isEntryFunc;
@@ -44,6 +48,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LegacyDivergenceAnalysis>();
AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.setPreservesAll();
}
@@ -58,6 +63,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
@@ -70,9 +76,79 @@ static void setNoClobberMetadata(Instruction *I) {
I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
}
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
- const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
- return !MSSA->isLiveOnEntryDef(MA);
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
+ MemorySSAWalker *Walker = MSSA->getWalker();
+ SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+ SmallSet<MemoryAccess *, 8> Visited;
+ MemoryLocation Loc(MemoryLocation::get(Load));
+
+ const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
+ Instruction *DefInst = Def->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " Def: " << *DefInst << '\n');
+
+ if (isa<FenceInst>(DefInst))
+ return false;
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_wave_barrier:
+ return false;
+ default:
+ break;
+ }
+ }
+
+ // Ignore atomics not aliasing with the original load; any atomic is a
+ // universal MemoryDef from MSSA's point of view, just like a fence.
+ const auto checkNoAlias = [this, Load](auto I) -> bool {
+ return I && AA->isNoAlias(I->getPointerOperand(),
+ Load->getPointerOperand());
+ };
+
+ if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+ checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+ return false;
+
+ return true;
+ };
+
+ LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access: it is either live
+  // on entry (nothing to do, the load is not clobbered), a MemoryDef, or a
+  // MemoryPhi if several MemoryDefs can define this memory state. In the
+  // MemoryPhi case add all the Defs to the WorkList and keep walking up,
+  // checking every definition of this memory location until the root. If
+  // all the defs are exhausted and we reach the entry state, there is no
+  // clobber. Along the way ignore barriers and fences, which MemorySSA
+  // treats as clobbers even though they do not actually write to memory.
+ while (!WorkList.empty()) {
+ MemoryAccess *MA = WorkList.pop_back_val();
+ if (!Visited.insert(MA).second)
+ continue;
+
+ if (MSSA->isLiveOnEntryDef(MA))
+ continue;
+
+ if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+ if (isReallyAClobber(Def)) {
+ LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
+ return true;
+ }
+
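+      // This Def does not clobber the load; continue the walk from its
+      // defining access, restricted to the load's memory location.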
+ WorkList.push_back(
+ Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+ continue;
+ }
+
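+    // A MemoryPhi merges several reaching definitions; all of them must be
+    // clobber-free, so queue every incoming value for checking.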
+ const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+ for (auto &Use : Phi->incoming_values())
+ WorkList.push_back(cast<MemoryAccess>(&Use));
+ }
+
+ LLVM_DEBUG(dbgs() << " -> no clobber\n");
+ return false;
}
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
@@ -84,9 +160,6 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
- auto isGlobalLoad = [&](LoadInst &Load)->bool {
- return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
- };
// We're tracking up to the Function boundaries, and cannot go beyond because
// of FunctionPass restrictions. We can only ensure that memory is not
// clobbered for memory operations that are live into entry points.
@@ -99,7 +172,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
}
bool NotClobbered = false;
- bool GlobalLoad = isGlobalLoad(I);
+ bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
if (PtrI)
NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
@@ -139,6 +212,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
DA = &getAnalysis<LegacyDivergenceAnalysis>();
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index bb2e723f4ab0..6e2984f2a04f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -88,6 +88,8 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
} else if (isHsaAbiVersion3(getGlobalSTI())) {
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
+ } else if (isHsaAbiVersion5(getGlobalSTI())) {
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerV5());
} else {
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4());
}
@@ -118,7 +120,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
TM.getTargetTriple().getOS() != Triple::AMDPAL)
return;
- if (isHsaAbiVersion3Or4(getGlobalSTI()))
+ if (isHsaAbiVersion3AndAbove(getGlobalSTI()))
getTargetStreamer()->EmitDirectiveAMDGCNTarget();
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
@@ -127,7 +129,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (isHsaAbiVersion3Or4(getGlobalSTI()))
+ if (isHsaAbiVersion3AndAbove(getGlobalSTI()))
return;
// HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
@@ -259,7 +261,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
- isHsaAbiVersion3Or4(getGlobalSTI())) {
+ isHsaAbiVersion3AndAbove(getGlobalSTI())) {
AsmPrinter::emitFunctionEntryLabel();
return;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 3ac7c45b3275..f5018e3a19ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -672,15 +672,15 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
Kern[".kind"] = Kern.getDocument()->getNode("fini");
}
-void MetadataStreamerV3::emitKernelArgs(const Function &Func,
- const GCNSubtarget &ST,
+void MetadataStreamerV3::emitKernelArgs(const MachineFunction &MF,
msgpack::MapDocNode Kern) {
+ auto &Func = MF.getFunction();
unsigned Offset = 0;
auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
emitKernelArg(Arg, Offset, Args);
- emitHiddenKernelArgs(Func, ST, Offset, Args);
+ emitHiddenKernelArgs(MF, Offset, Args);
Kern[".args"] = Args;
}
@@ -789,10 +789,12 @@ void MetadataStreamerV3::emitKernelArg(
Args.push_back(Arg);
}
-void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
- const GCNSubtarget &ST,
+void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF,
unsigned &Offset,
msgpack::ArrayDocNode Args) {
+ auto &Func = MF.getFunction();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
if (!HiddenArgNumBytes)
return;
@@ -910,7 +912,6 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
auto Kern = getHSAKernelProps(MF, ProgramInfo);
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
Func.getCallingConv() == CallingConv::SPIR_KERNEL);
@@ -924,7 +925,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
(Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
emitKernelLanguage(Func, Kern);
emitKernelAttrs(Func, Kern);
- emitKernelArgs(Func, ST, Kern);
+ emitKernelArgs(MF, Kern);
}
Kernels.push_back(Kern);
@@ -954,6 +955,97 @@ void MetadataStreamerV4::begin(const Module &Mod,
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
}
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV5
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV5::emitVersion() {
+ auto Version = HSAMetadataDoc->getArrayNode();
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV5));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV5));
+ getRootMetadata("amdhsa.version") = Version;
+}
+
+void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF,
+ unsigned &Offset,
+ msgpack::ArrayDocNode Args) {
+ auto &Func = MF.getFunction();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const Module *M = Func.getParent();
+ auto &DL = M->getDataLayout();
+
+ auto Int64Ty = Type::getInt64Ty(Func.getContext());
+ auto Int32Ty = Type::getInt32Ty(Func.getContext());
+ auto Int16Ty = Type::getInt16Ty(Func.getContext());
+
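+  // Each emitKernelArg call advances Offset past the emitted argument, so
+  // the order below defines the code object v5 hidden-argument layout.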
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
+
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
+
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
+
+ // Reserved for hidden_tool_correlation_id.
+ Offset += 8;
+
+ Offset += 8; // Reserved.
+
+ emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args);
+ emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args);
+ emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args);
+
+ emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
+
+ Offset += 6; // Reserved.
+ auto Int8PtrTy =
+ Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
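+  // The optional buffer pointers below are emitted only when the module uses
+  // the corresponding feature; otherwise their 8-byte slots are skipped so
+  // the later offsets stay fixed.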
+ if (M->getNamedMetadata("llvm.printf.fmts")) {
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+ Args);
+ } else
+ Offset += 8; // Skipped.
+
+ if (M->getModuleFlag("amdgpu_hostcall")) {
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
+ Args);
+ } else
+ Offset += 8; // Skipped.
+
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
+ Args);
+
+ // Ignore temporarily until it is implemented.
+ // emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
+ Offset += 8;
+
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+ Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
+ Args);
+ } else
+ Offset += 16; // Skipped.
+
+ Offset += 72; // Reserved.
+
+ // hidden_private_base and hidden_shared_base are only used by GFX8.
+ if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
+ emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
+ } else
+ Offset += 8; // Skipped.
+
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI.hasQueuePtr())
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
+}
+
} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 54ed0afbba6d..bcf7fc449094 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -53,6 +53,11 @@ public:
virtual void emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) = 0;
+
+protected:
+ virtual void emitVersion() = 0;
+ virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+ msgpack::ArrayDocNode Args) = 0;
};
// TODO: Rename MetadataStreamerV3 -> MetadataStreamerMsgPackV3.
@@ -79,7 +84,7 @@ protected:
msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) const;
- void emitVersion();
+ void emitVersion() override;
void emitPrintf(const Module &Mod);
@@ -87,8 +92,7 @@ protected:
void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelArgs(const Function &Func, const GCNSubtarget &ST,
- msgpack::MapDocNode Kern);
+ void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern);
void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args);
@@ -100,8 +104,8 @@ protected:
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
- void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST,
- unsigned &Offset, msgpack::ArrayDocNode Args);
+ void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+ msgpack::ArrayDocNode Args) override;
msgpack::DocNode &getRootMetadata(StringRef Key) {
return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
@@ -127,9 +131,9 @@ public:
};
// TODO: Rename MetadataStreamerV4 -> MetadataStreamerMsgPackV4.
-class MetadataStreamerV4 final : public MetadataStreamerV3 {
- void emitVersion();
-
+class MetadataStreamerV4 : public MetadataStreamerV3 {
+protected:
+ void emitVersion() override;
void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
public:
@@ -140,6 +144,18 @@ public:
const IsaInfo::AMDGPUTargetID &TargetID) override;
};
+// TODO: Rename MetadataStreamerV5 -> MetadataStreamerMsgPackV5.
+class MetadataStreamerV5 final : public MetadataStreamerV4 {
+protected:
+ void emitVersion() override;
+ void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+ msgpack::ArrayDocNode Args) override;
+
+public:
+ MetadataStreamerV5() = default;
+ ~MetadataStreamerV5() = default;
+};
+
// TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2.
class MetadataStreamerV2 final : public MetadataStreamer {
private:
@@ -167,8 +183,6 @@ private:
const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) const;
- void emitVersion();
-
void emitPrintf(const Module &Mod);
void emitKernelLanguage(const Function &Func);
@@ -191,6 +205,13 @@ private:
return HSAMetadata;
}
+protected:
+ void emitVersion() override;
+ void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+ msgpack::ArrayDocNode Args) override {
+ llvm_unreachable("Dummy override should not be invoked!");
+ }
+
public:
MetadataStreamerV2() = default;
~MetadataStreamerV2() = default;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 04c6f67ed339..645d05aa9238 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4778,6 +4778,7 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
return legalizeTrapHsaQueuePtr(MI, MRI, B);
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
return ST.supportsGetDoorbellID() ?
legalizeTrapHsa(MI, MRI, B) :
legalizeTrapHsaQueuePtr(MI, MRI, B);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index c28427758ac7..bbbadfdfd444 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -16,8 +16,9 @@
#include "GCNSubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2d8126a49327..99b7ffb33884 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -13,15 +13,16 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
-#include "Utils/AMDGPUBaseInfo.h"
#define DEBUG_TYPE "amdgpu-promote-alloca"
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index c1c88d9a7462..ffe626513d47 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1129,7 +1129,8 @@ class KernelScopeInfo {
if (i >= SgprIndexUnusedMin) {
SgprIndexUnusedMin = ++i;
if (Ctx) {
- MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count"));
+ MCSymbol* const Sym =
+ Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count"));
Sym->setVariableValue(MCConstantExpr::create(SgprIndexUnusedMin, *Ctx));
}
}
@@ -1139,7 +1140,8 @@ class KernelScopeInfo {
if (i >= VgprIndexUnusedMin) {
VgprIndexUnusedMin = ++i;
if (Ctx) {
- MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+ MCSymbol* const Sym =
+ Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
}
}
@@ -1296,7 +1298,7 @@ public:
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
@@ -1313,7 +1315,7 @@ public:
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
- if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) {
initializeGprCountSymbol(IS_VGPR);
initializeGprCountSymbol(IS_SGPR);
} else
@@ -2747,7 +2749,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
return nullptr;
}
- if (isHsaAbiVersion3Or4(&getSTI())) {
+ if (isHsaAbiVersion3AndAbove(&getSTI())) {
if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
@@ -5099,7 +5101,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
const char *AssemblerDirectiveBegin;
const char *AssemblerDirectiveEnd;
std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
- isHsaAbiVersion3Or4(&getSTI())
+ isHsaAbiVersion3AndAbove(&getSTI())
? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
HSAMD::V3::AssemblerDirectiveEnd)
: std::make_tuple(HSAMD::AssemblerDirectiveBegin,
@@ -5116,7 +5118,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
HSAMetadataString))
return true;
- if (isHsaAbiVersion3Or4(&getSTI())) {
+ if (isHsaAbiVersion3AndAbove(&getSTI())) {
if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
return Error(getLoc(), "invalid HSA metadata");
} else {
@@ -5266,7 +5268,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (isHsaAbiVersion3Or4(&getSTI())) {
+ if (isHsaAbiVersion3AndAbove(&getSTI())) {
if (IDVal == ".amdhsa_kernel")
return ParseDirectiveAMDHSAKernel();
@@ -7440,7 +7442,7 @@ void AMDGPUAsmParser::onBeginOfFile() {
if (!getTargetStreamer().getTargetID())
getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString());
- if (isHsaAbiVersion3Or4(&getSTI()))
+ if (isHsaAbiVersion3AndAbove(&getSTI()))
getTargetStreamer().EmitDirectiveAMDGCNTarget();
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 9578bdb0bad0..7aa5f1abf65b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -396,6 +396,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
if (getTargetID()->isXnackSupported())
OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
break;
@@ -578,6 +579,7 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
return getEFlagsV3();
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
return getEFlagsV4();
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561866b5a398..e2f4a0896bc3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5423,6 +5423,7 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
return lowerTrapHsaQueuePtr(Op, DAG);
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
return Subtarget->supportsGetDoorbellID() ?
lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
}
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c18637bdbc43..44bdbe37dec0 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -938,12 +938,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
// 2. It is safe to move MBBI down past the instruction that I will
// be merged into.
- if (MBBI->hasUnmodeledSideEffects()) {
- // We can't re-order this instruction with respect to other memory
- // operations, so we fail both conditions mentioned above.
- return false;
- }
-
if (MBBI->mayLoadOrStore() &&
(!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
!canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
@@ -1977,10 +1971,10 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
- // Don't combine if volatile. We also won't be able to merge across this, so
- // break the search. We can look after this barrier for separate merges.
- if (MI.hasOrderedMemoryRef()) {
- LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
+    // Treat volatile accesses, ordered accesses, and unmodeled side effects
+    // as barriers. We can still look for separate merges after this barrier.
+ if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
+ LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
// Search will resume after this instruction in a separate merge list.
++BlockI;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1e96266eb06c..683be871ff82 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -99,6 +99,8 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
case 4:
return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
+ case 5:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V5;
default:
report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
Twine(AmdhsaCodeObjectVersion));
@@ -123,8 +125,15 @@ bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
return false;
}
-bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
- return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+bool isHsaAbiVersion5(const MCSubtargetInfo *STI) {
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V5;
+ return false;
+}
+
+bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
+ return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI) ||
+ isHsaAbiVersion5(STI);
}
#define GET_MIMGBaseOpcodesTable_IMPL
@@ -495,6 +504,7 @@ std::string AMDGPUTargetID::toString() const {
Features += "+sram-ecc";
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
// sramecc.
if (getSramEccSetting() == TargetIDSetting::Off)
Features += ":sramecc-";
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 89f928eb8b92..4516b511f3c8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -47,9 +47,12 @@ bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 4,
/// false otherwise.
bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 5,
+/// false otherwise.
+bool isHsaAbiVersion5(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// \returns True if HSA OS ABI Version identification is 3 and above,
/// false otherwise.
-bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
+bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
struct GcnBufferFormatInfo {
unsigned Format;
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 4efbdbb2abc8..27edf69b4abf 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -656,6 +656,8 @@ def ProcA710 : SubtargetFeature<"cortex-a710", "ARMProcFamily",
"CortexA710", "Cortex-A710 ARM processors", []>;
def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
"Cortex-X1 ARM processors", []>;
+def ProcX1C : SubtargetFeature<"cortex-x1c", "ARMProcFamily", "CortexX1C",
+ "Cortex-X1C ARM processors", []>;
def ProcV1 : SubtargetFeature<"neoverse-v1", "ARMProcFamily",
"NeoverseV1", "Neoverse-V1 ARM processors", []>;
@@ -1443,6 +1445,14 @@ def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"cortex-x1c", [ARMv82a, ProcX1C,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
def : ProcNoItin<"neoverse-v1", [ARMv84a,
FeatureHWDivThumb,
FeatureHWDivARM,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index cde715880376..5b0bae4d9274 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -752,23 +752,17 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
const MCInstrDesc &MCID = MI.getDesc();
- if (MCID.getSize())
- return MCID.getSize();
switch (MI.getOpcode()) {
default:
- // pseudo-instruction sizes are zero.
- return 0;
+    // Return the size specified in the .td file. If there is none, return 0,
+    // as we cannot define a default size: Thumb1 instructions are 2 bytes,
+    // Thumb2 instructions are 2-4 bytes, and ARM instructions are 4 bytes,
+    // in contrast to AArch64 instructions, which have a default size of
+    // 4 bytes.
+ return MCID.getSize();
case TargetOpcode::BUNDLE:
return getInstBundleLength(MI);
- case ARM::MOVi16_ga_pcrel:
- case ARM::MOVTi16_ga_pcrel:
- case ARM::t2MOVi16_ga_pcrel:
- case ARM::t2MOVTi16_ga_pcrel:
- return 4;
- case ARM::MOVi32imm:
- case ARM::t2MOVi32imm:
- return 8;
case ARM::CONSTPOOL_ENTRY:
case ARM::JUMPTABLE_INSTS:
case ARM::JUMPTABLE_ADDRS:
@@ -777,19 +771,6 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// If this machine instr is a constant pool entry, its size is recorded as
// operand #2.
return MI.getOperand(2).getImm();
- case ARM::Int_eh_sjlj_longjmp:
- return 16;
- case ARM::tInt_eh_sjlj_longjmp:
- return 10;
- case ARM::tInt_WIN_eh_sjlj_longjmp:
- return 12;
- case ARM::Int_eh_sjlj_setjmp:
- case ARM::Int_eh_sjlj_setjmp_nofp:
- return 20;
- case ARM::tInt_eh_sjlj_setjmp:
- case ARM::t2Int_eh_sjlj_setjmp:
- case ARM::t2Int_eh_sjlj_setjmp_nofp:
- return 12;
case ARM::SPACE:
return MI.getOperand(1).getImm();
case ARM::INLINEASM:
@@ -800,14 +781,6 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
Size = alignTo(Size, 4);
return Size;
}
- case ARM::SpeculationBarrierISBDSBEndBB:
- case ARM::t2SpeculationBarrierISBDSBEndBB:
- // This gets lowered to 2 4-byte instructions.
- return 8;
- case ARM::SpeculationBarrierSBEndBB:
- case ARM::t2SpeculationBarrierSBEndBB:
- // This gets lowered to 1 4-byte instructions.
- return 4;
}
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fe4e6b24367a..1b41427a1cab 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14527,7 +14527,7 @@ static SDValue PerformXORCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
const TargetLowering *TLI = Subtarget->getTargetLowering();
- if (TLI->isConstTrueVal(N1.getNode()) &&
+ if (TLI->isConstTrueVal(N1) &&
(N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
if (CanInvertMVEVCMP(N0)) {
SDLoc DL(N0);
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 1c1db473f866..32a3911d3369 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -3657,6 +3657,8 @@ def : InstAlias<"mov${p} $Rd, $imm",
(MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>,
Requires<[IsARM, HasV6T2]>;
+// This gets lowered to a single 4-byte instruction
+let Size = 4 in
def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
Sched<[WriteALU]>;
@@ -3680,6 +3682,8 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
let DecoderMethod = "DecodeArmMOVTWInstruction";
}
+// This gets lowered to a single 4-byte instruction
+let Size = 4 in
def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
Sched<[WriteALU]>;
@@ -5895,27 +5899,30 @@ def : ARMPat<(ARMthread_pointer), (MRC 15, 0, 13, 0, 3)>,
//
// These are pseudo-instructions and are lowered to individual MC-insts, so
// no encoding information is necessary.
+// This gets lowered to an instruction sequence of 20 bytes
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
- hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1, Size = 20 in {
def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
NoItinerary,
[(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
Requires<[IsARM, HasVFP2]>;
}
+// This gets lowered to an instruction sequence of 20 bytes
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
- hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+ hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1, Size = 20 in {
def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
NoItinerary,
[(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
Requires<[IsARM, NoVFP]>;
}
+// This gets lowered to an instruction sequence of 16 bytes
// FIXME: Non-IOS version(s)
-let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, Size = 16,
Defs = [ R7, LR, SP ] in {
def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
NoItinerary,
@@ -5958,7 +5965,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in
// This is a single pseudo instruction, the benefit is that it can be remat'd
// as a single unit instead of having to handle reg inputs.
// FIXME: Remove this when we can do generalized remat.
-let isReMaterializable = 1, isMoveImm = 1 in
+let isReMaterializable = 1, isMoveImm = 1, Size = 8 in
def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
[(set GPR:$dst, (arm_i32imm:$src))]>,
Requires<[IsARM]>;
@@ -6419,8 +6426,12 @@ def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
// SpeculationBarrierEndBB must only be used after an unconditional control
// flow, i.e. after a terminator for which isBarrier is True.
let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ // This gets lowered to a pair of 4-byte instructions
+ let Size = 8 in
def SpeculationBarrierISBDSBEndBB
: PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
+  // This gets lowered to a single 4-byte instruction
+ let Size = 4 in
def SpeculationBarrierSBEndBB
: PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
}
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index f09ad8167600..71527ae1ab11 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1537,25 +1537,28 @@ def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
// Defs. By doing so, we also cause the prologue/epilogue code to actively
// preserve all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
+// This gets lowered to an instruction sequence of 12 bytes
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
- hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Size = 12,
usesCustomInserter = 1 in
def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
AddrModeNone, 0, NoItinerary, "","",
[(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+// This gets lowered to an instruction sequence of 10 bytes
// FIXME: Non-IOS version(s)
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
- Defs = [ R7, LR, SP ] in
+ Size = 10, Defs = [ R7, LR, SP ] in
def tInt_eh_sjlj_longjmp : XI<(outs), (ins tGPR:$src, tGPR:$scratch),
AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "",
[(ARMeh_sjlj_longjmp tGPR:$src, tGPR:$scratch)]>,
Requires<[IsThumb,IsNotWindows]>;
+// This gets lowered to an instruction sequence of 12 bytes
// (Windows is Thumb2-only)
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
- Defs = [ R11, LR, SP ] in
+ Size = 12, Defs = [ R11, LR, SP ] in
def tInt_WIN_eh_sjlj_longjmp
: XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 6e8e61ca2b8e..f80b9a5053f7 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -2194,6 +2194,8 @@ def : InstAlias<"mov${p} $Rd, $imm",
(t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteALU]>;
+// This gets lowered to a single 4-byte instruction
+let Size = 4 in
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
Sched<[WriteALU]>;
@@ -2223,6 +2225,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
let DecoderMethod = "DecodeT2MOVTWInstruction";
}
+// This gets lowered to a single 4-byte instruction
+let Size = 4 in
def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
Sched<[WriteALU]>, Requires<[IsThumb, HasV8MBaseline]>;
@@ -3814,10 +3818,11 @@ def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
// doing so, we also cause the prologue/epilogue code to actively preserve
// all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
+// This gets lowered to an instruction sequence of 12 bytes
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
- hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Size = 12,
usesCustomInserter = 1 in {
def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
AddrModeNone, 0, NoItinerary, "", "",
@@ -3825,9 +3830,10 @@ let Defs =
Requires<[IsThumb2, HasVFP2]>;
}
+// This gets lowered to an instruction sequence of 12 bytes
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
- hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Size = 12,
usesCustomInserter = 1 in {
def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
AddrModeNone, 0, NoItinerary, "", "",
@@ -4224,7 +4230,7 @@ def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
// 32-bit immediate using movw + movt.
// This is a single pseudo instruction to make it re-materializable.
// FIXME: Remove this when we can do generalized remat.
-let isReMaterializable = 1, isMoveImm = 1 in
+let isReMaterializable = 1, isMoveImm = 1, Size = 8 in
def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
[(set rGPR:$dst, (i32 imm:$src))]>,
Requires<[IsThumb, UseMovt]>;
@@ -5006,8 +5012,12 @@ def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
// SpeculationBarrierEndBB must only be used after an unconditional control
// flow, i.e. after a terminator for which isBarrier is True.
let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ // This gets lowered to a pair of 4-byte instructions
+ let Size = 8 in
def t2SpeculationBarrierISBDSBEndBB
: PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
+  // This gets lowered to a single 4-byte instruction
+ let Size = 4 in
def t2SpeculationBarrierSBEndBB
: PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
}
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 2dd25234dc50..32160b109343 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -304,6 +304,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexM7:
case CortexR52:
case CortexX1:
+ case CortexX1C:
break;
case Exynos:
LdStMultipleTiming = SingleIssuePlusExtras;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 1c2b7ee6ba35..7cbdc014299f 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -77,6 +77,7 @@ protected:
CortexR52,
CortexR7,
CortexX1,
+ CortexX1C,
Exynos,
Krait,
Kryo,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index e0750a9945d2..d9d563ead260 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2109,9 +2109,6 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
}
Type *T = I.getType();
- if (T->isPointerTy())
- T = T->getPointerElementType();
-
if (T->getScalarSizeInBits() > 32) {
LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index ea6a7498e27f..311e43d77210 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -313,12 +313,18 @@ bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN,
return false;
}
+ // If the register is undefined (for example if it's a reserved register),
+ // it may still be possible to extend the range, but it's safer to be
+ // conservative and just punt.
+ if (LRExtRegRD == 0)
+ return false;
+
MachineInstr *UseMI = NodeAddr<StmtNode *>(IA).Addr->getCode();
NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD);
// Reaching Def to LRExtReg can't be a phi.
if ((LRExtRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
MI->getParent() != UseMI->getParent())
- return false;
+ return false;
}
return true;
}
diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
index 860c0ce29326..79e9ad4dd1d2 100644
--- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
@@ -21,13 +21,32 @@ using namespace llvm;
M68kLegalizerInfo::M68kLegalizerInfo(const M68kSubtarget &ST) {
using namespace TargetOpcode;
- const LLT S32 = LLT::scalar(32);
- const LLT P0 = LLT::pointer(0, 32);
- getActionDefinitionsBuilder(G_LOAD).legalFor({S32});
- getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({P0});
- getActionDefinitionsBuilder(G_ADD).legalFor({S32});
- getActionDefinitionsBuilder(G_SUB).legalFor({S32});
- getActionDefinitionsBuilder(G_MUL).legalFor({S32});
- getActionDefinitionsBuilder(G_UDIV).legalFor({S32});
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT p0 = LLT::pointer(0, 32);
+
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UDIV, G_AND})
+ .legalFor({s8, s16, s32})
+ .clampScalar(0, s8, s32)
+ .widenScalarToNextPow2(0, 8);
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32, p0})
+ .clampScalar(0, s32, s32);
+
+ getActionDefinitionsBuilder({G_FRAME_INDEX, G_GLOBAL_VALUE}).legalFor({p0});
+
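+  // Each memory descriptor is {value type, pointer type, memory type,
+  // alignment}: s32 results may be produced by extending s8/s16 loads, and
+  // pointers are loaded and stored as 32-bit values.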
+ getActionDefinitionsBuilder({G_STORE, G_LOAD})
+ .legalForTypesWithMemDesc({{s32, p0, s32, 4},
+ {s32, p0, s16, 4},
+ {s32, p0, s8, 4},
+ {s16, p0, s16, 2},
+ {s8, p0, s8, 1},
+ {p0, p0, s32, 4}})
+ .clampScalar(0, s8, s32);
+
+ getActionDefinitionsBuilder(G_PTR_ADD).legalFor({{p0, s32}});
+
getLegacyLegalizerInfo().computeTables();
}
diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td
index d610bce5c277..0d1278102378 100644
--- a/llvm/lib/Target/M68k/M68kInstrBits.td
+++ b/llvm/lib/Target/M68k/M68kInstrBits.td
@@ -79,6 +79,10 @@ def BTST32di : MxBTST_RI<MxType32d>;
// Memory BTST limited to 8 bits only
def BTST8jd : MxBTST_MR<MxType8d, MxType8.JOp, MxType8.JPat,
MxEncEAj_0, MxExtEmpty>;
+def BTST8od : MxBTST_MR<MxType8d, MxType8.OOp, MxType8.OPat,
+ MxEncEAo_0, MxExtEmpty>;
+def BTST8ed : MxBTST_MR<MxType8d, MxType8.EOp, MxType8.EPat,
+ MxEncEAe_0, MxExtEmpty>;
def BTST8pd : MxBTST_MR<MxType8d, MxType8.POp, MxType8.PPat,
MxEncEAp_0, MxExtI16_0>;
def BTST8fd : MxBTST_MR<MxType8d, MxType8.FOp, MxType8.FPat,
@@ -90,6 +94,10 @@ def BTST8kd : MxBTST_MR<MxType8d, MxType8.KOp, MxType8.KPat,
def BTST8ji : MxBTST_MI<MxType8d, MxType8.JOp, MxType8.JPat,
MxEncEAj_0, MxExtEmpty>;
+def BTST8oi : MxBTST_MI<MxType8d, MxType8.OOp, MxType8.OPat,
+ MxEncEAo_0, MxExtEmpty>;
+def BTST8ei : MxBTST_MI<MxType8d, MxType8.EOp, MxType8.EPat,
+ MxEncEAe_0, MxExtEmpty>;
def BTST8pi : MxBTST_MI<MxType8d, MxType8.POp, MxType8.PPat,
MxEncEAp_0, MxExtI16_0>;
def BTST8fi : MxBTST_MI<MxType8d, MxType8.FOp, MxType8.FPat,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index eac237bb27bb..7b5248906b56 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -574,7 +574,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
- setOperationAction(Op, MVT::f64, GetMinMaxAction(Expand));
setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 22e200e77831..22084cddc092 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -896,6 +896,7 @@ defm FMUL : F3_fma_component<"mul", fmul>;
defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
+// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
defm FMINNAN : F3<"min.NaN", fminimum>;
defm FMAXNAN : F3<"max.NaN", fmaximum>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 25cc34badda0..cbeae0ab03b8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1252,7 +1252,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal);
} else {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
@@ -9093,22 +9092,30 @@ bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
unsigned &Opcode) {
- const SDNode *InputNode = Op.getOperand(0).getNode();
- if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
- return false;
-
- if (!Subtarget.hasVSX())
+ LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0));
+ if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode))
return false;
EVT Ty = Op->getValueType(0);
- if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
- Ty == MVT::v8i16 || Ty == MVT::v16i8)
+ // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
+ // as we cannot handle extending loads for these types.
+ if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
+ ISD::isNON_EXTLoad(InputNode))
+ return true;
+
+ EVT MemVT = InputNode->getMemoryVT();
+ // For v8i16 and v16i8 types, extending loads can be handled as long as the
+ // memory VT is the same vector element VT type.
+ // The loads feeding into the v8i16 and v16i8 types will be extending because
+ // scalar i8/i16 are not legal types.
+ if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) &&
+ (MemVT == Ty.getVectorElementType()))
return true;
if (Ty == MVT::v2i64) {
// Check the extend type, when the input type is i32, and the output vector
// type is v2i64.
- if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+ if (MemVT == MVT::i32) {
if (ISD::isZEXTLoad(InputNode))
Opcode = PPCISD::ZEXT_LD_SPLAT;
if (ISD::isSEXTLoad(InputNode))
@@ -10755,6 +10762,26 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if (VT == MVT::v2f64 && C)
return Op;
+ if (Subtarget.hasP9Vector()) {
+    // An f32 load feeding into a v4f32 insert_vector_elt is handled this way
+    // because, on P10, it allows this specific insert_vector_elt load pattern
+    // to use the refactored load and store infrastructure and thereby exploit
+    // prefixed loads.
+ // On targets with inexpensive direct moves (Power9 and up), a
+ // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer
+ // load since a single precision load will involve conversion to double
+ // precision on the load followed by another conversion to single precision.
+ if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) &&
+ (isa<LoadSDNode>(V2))) {
+ SDValue BitcastVector = DAG.getBitcast(MVT::v4i32, V1);
+ SDValue BitcastLoad = DAG.getBitcast(MVT::i32, V2);
+ SDValue InsVecElt =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i32, BitcastVector,
+ BitcastLoad, Op.getOperand(2));
+ return DAG.getBitcast(MVT::v4f32, InsVecElt);
+ }
+ }
+
if (Subtarget.isISA3_1()) {
if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
return SDValue();
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index fe354208533b..ff43426dd1ef 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2816,32 +2816,20 @@ let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in {
def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)),
(VINSWVRX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
- (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
- (VINSWRX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
- (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
- def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
- def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
- def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
let AddedComplexity = 400 in {
// Immediate vector insert element
foreach Idx = [0, 1, 2, 3] in {
def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, Idx)),
(VINSW $vDi, !mul(!sub(3, Idx), 4), $rA)>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), Idx)),
- (VINSW $vDi, !mul(!sub(3, Idx), 4), (LWZ memri:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), Idx)),
- (VINSW $vDi, !mul(!sub(3, Idx), 4), (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), Idx)),
- (VINSW $vDi, !mul(!sub(3, Idx), 4), (LWZX memrr:$rA))>;
}
foreach i = [0, 1] in
def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, (i64 i))),
@@ -2860,12 +2848,6 @@ let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC32] in {
def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i32:$rB)),
(VINSWVLX $vDi, InsertEltShift.Left2, (XSCVDPSPN $rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i32:$rB)),
- (VINSWLX v4f32:$vDi, InsertEltShift.Left2, (LWZ memri:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i32:$rB)),
- (VINSWLX v4f32:$vDi, InsertEltShift.Left2, (PLWZ memri34:$rA))>;
- def: Pat<(v4f32(insertelt v4f32 : $vDi, (f32(load xaddr : $rA)), i32 : $rB)),
- (VINSWLX v4f32 : $vDi, InsertEltShift.Left2, (LWZX memrr : $rA))>;
}
let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in {
@@ -2881,20 +2863,14 @@ let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in {
def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)),
(VINSWVLX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
- (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
- (VINSWLX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
- (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
- def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
- def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
- def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
}
@@ -2904,15 +2880,6 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX, IsBigEndian] in {
foreach Idx = [0, 1, 2, 3] in {
def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, (Ty Idx))),
(VINSW $vDi, !mul(Idx, 4), $rA)>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)),
- (Ty Idx))),
- (VINSW $vDi, !mul(Idx, 4), (LWZ memri:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)),
- (Ty Idx))),
- (VINSW $vDi, !mul(Idx, 4), (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)),
- (Ty Idx))),
- (VINSW $vDi, !mul(Idx, 4), (LWZX memrr:$rA))>;
}
}
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index a2ea34fe11c7..01f36e6dcdd2 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2266,8 +2266,8 @@ void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value,
if (Inst.Opc == RISCV::LUI) {
emitToStreamer(
Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm));
- } else if (Inst.Opc == RISCV::ADDUW) {
- emitToStreamer(Out, MCInstBuilder(RISCV::ADDUW)
+ } else if (Inst.Opc == RISCV::ADD_UW) {
+ emitToStreamer(Out, MCInstBuilder(RISCV::ADD_UW)
.addReg(DestReg)
.addReg(SrcReg)
.addReg(RISCV::X0));
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 14d0191a505f..1078403a3fd2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -197,9 +197,9 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Get byte count of instruction.
unsigned Size = Desc.getSize();
- // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded
- // instructions for each pseudo, and must be updated when adding new pseudos
- // or changing existing ones.
+ // RISCVInstrInfo::getInstSizeInBytes expects that the total size of the
+ // expanded instructions for each pseudo is correct in the Size field of the
+ // tablegen definition for the pseudo.
if (MI.getOpcode() == RISCV::PseudoCALLReg ||
MI.getOpcode() == RISCV::PseudoCALL ||
MI.getOpcode() == RISCV::PseudoTAIL ||
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 18858209aa9b..e935179e5f9b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -31,7 +31,7 @@ static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) {
case RISCV::LUI:
Compressed = isInt<6>(Instr.Imm);
break;
- case RISCV::ADDUW:
+ case RISCV::ADD_UW:
Compressed = false;
break;
}
@@ -123,10 +123,11 @@ static void generateInstSeqImpl(int64_t Val,
}
}
- // Try to use SLLIUW for Hi52 when it is uint32 but not int32.
+ // Try to use SLLI_UW for Hi52 when it is uint32 but not int32.
if (isUInt<32>((uint64_t)Hi52) && !isInt<32>((uint64_t)Hi52) &&
ActiveFeatures[RISCV::FeatureStdExtZba]) {
- // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with SLLIUW.
+ // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with
+ // SLLI_UW.
Hi52 = ((uint64_t)Hi52) | (0xffffffffull << 32);
Unsigned = true;
}
@@ -134,7 +135,7 @@ static void generateInstSeqImpl(int64_t Val,
generateInstSeqImpl(Hi52, ActiveFeatures, Res);
if (Unsigned)
- Res.push_back(RISCVMatInt::Inst(RISCV::SLLIUW, ShiftAmount));
+ Res.push_back(RISCVMatInt::Inst(RISCV::SLLI_UW, ShiftAmount));
else
Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount));
if (Lo12)
@@ -210,7 +211,7 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
uint64_t LeadingOnesVal = Val | maskLeadingOnes<uint64_t>(LeadingZeros);
TmpSeq.clear();
generateInstSeqImpl(LeadingOnesVal, ActiveFeatures, TmpSeq);
- TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDUW, 0));
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADD_UW, 0));
// Keep the new sequence if it is an improvement.
if (TmpSeq.size() < Res.size()) {
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 5b0f27c5e937..e32a8fb010de 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -52,11 +52,17 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
def FeatureStdExtZfh
: SubtargetFeature<"zfh", "HasStdExtZfh", "true",
"'Zfh' (Half-Precision Floating-Point)",
- [FeatureStdExtZfhmin, FeatureStdExtF]>;
+ [FeatureStdExtF]>;
def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">,
AssemblerPredicate<(all_of FeatureStdExtZfh),
"'Zfh' (Half-Precision Floating-Point)">;
+def HasStdExtZfhOrZfhmin
+ : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZfhmin()">,
+ AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZfhmin),
+ "'Zfh' (Half-Precision Floating-Point) or "
+ "'Zfhmin' (Half-Precision Floating-Point Minimal)">;
+
def FeatureStdExtC
: SubtargetFeature<"c", "HasStdExtC", "true",
"'C' (Compressed Instructions)">;
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
index 26ce16486bd9..40ee7ca6bc1c 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -86,9 +86,9 @@ bool RISCVExpandAtomicPseudo::expandMBB(MachineBasicBlock &MBB) {
bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
- // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded
- // instructions for each pseudo, and must be updated when adding new pseudos
- // or changing existing ones.
+ // RISCVInstrInfo::getInstSizeInBytes expects that the total size of the
+ // expanded instructions for each pseudo is correct in the Size field of the
+ // tablegen definition for the pseudo.
switch (MBBI->getOpcode()) {
case RISCV::PseudoAtomicLoadNand32:
return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32,
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 80340ee81509..0c5c13db7112 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -92,9 +92,9 @@ bool RISCVExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
- // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded
- // instructions for each pseudo, and must be updated when adding new pseudos
- // or changing existing ones.
+ // RISCVInstrInfo::getInstSizeInBytes expects the Size field of each pseudo's
+ // tablegen definition to give the total size of the instructions the pseudo
+ // expands to.
switch (MBBI->getOpcode()) {
case RISCV::PseudoLLA:
return expandLoadLocalAddress(MBB, MBBI, NextMBBI);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 5870502d74d5..6f77428ae721 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -166,8 +166,8 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT,
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
if (Inst.Opc == RISCV::LUI)
Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm);
- else if (Inst.Opc == RISCV::ADDUW)
- Result = CurDAG->getMachineNode(RISCV::ADDUW, DL, XLenVT, SrcReg,
+ else if (Inst.Opc == RISCV::ADD_UW)
+ Result = CurDAG->getMachineNode(RISCV::ADD_UW, DL, XLenVT, SrcReg,
CurDAG->getRegister(RISCV::X0, XLenVT));
else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD ||
Inst.Opc == RISCV::SH3ADD)
@@ -775,10 +775,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + C3)) << C2)) {
// Use slli.uw when possible.
if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) {
- SDNode *SLLIUW =
- CurDAG->getMachineNode(RISCV::SLLIUW, DL, XLenVT, X,
+ SDNode *SLLI_UW =
+ CurDAG->getMachineNode(RISCV::SLLI_UW, DL, XLenVT, X,
CurDAG->getTargetConstant(C2, DL, XLenVT));
- ReplaceNode(Node, SLLIUW);
+ ReplaceNode(Node, SLLI_UW);
return;
}
@@ -1811,7 +1811,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
case RISCV::CLZW:
case RISCV::CTZW:
case RISCV::CPOPW:
- case RISCV::SLLIUW:
+ case RISCV::SLLI_UW:
case RISCV::FCVT_H_W:
case RISCV::FCVT_H_WU:
case RISCV::FCVT_S_W:
@@ -1830,20 +1830,20 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
if (Bits < (64 - countLeadingZeros(User->getConstantOperandVal(1))))
return false;
break;
- case RISCV::SEXTB:
+ case RISCV::SEXT_B:
if (Bits < 8)
return false;
break;
- case RISCV::SEXTH:
- case RISCV::ZEXTH_RV32:
- case RISCV::ZEXTH_RV64:
+ case RISCV::SEXT_H:
+ case RISCV::ZEXT_H_RV32:
+ case RISCV::ZEXT_H_RV64:
if (Bits < 16)
return false;
break;
- case RISCV::ADDUW:
- case RISCV::SH1ADDUW:
- case RISCV::SH2ADDUW:
- case RISCV::SH3ADDUW:
+ case RISCV::ADD_UW:
+ case RISCV::SH1ADD_UW:
+ case RISCV::SH2ADD_UW:
+ case RISCV::SH3ADD_UW:
// The first operand to add.uw/shXadd.uw is implicitly zero extended from
// 32 bits.
if (UI.getOperandNo() != 0 || Bits < 32)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5cc3aa35d4d2..97d24c8e9c0b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -282,6 +282,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb())
? Legal
: Expand);
+ // Zbkb can use rev8+brev8 to implement bitreverse.
+ setOperationAction(ISD::BITREVERSE, XLenVT,
+ Subtarget.hasStdExtZbkb() ? Custom : Expand);
}
if (Subtarget.hasStdExtZbb()) {
@@ -1082,6 +1085,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::STORE);
}
+
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
@@ -1115,17 +1121,15 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::riscv_masked_atomicrmw_min_i32:
case Intrinsic::riscv_masked_atomicrmw_umax_i32:
case Intrinsic::riscv_masked_atomicrmw_umin_i32:
- case Intrinsic::riscv_masked_cmpxchg_i32: {
- PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+ case Intrinsic::riscv_masked_cmpxchg_i32:
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
+ Info.memVT = MVT::i32;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(4);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
- }
case Intrinsic::riscv_masked_strided_load:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.ptrVal = I.getArgOperand(1);
@@ -2952,17 +2956,26 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_VOID(Op, DAG);
case ISD::BSWAP:
case ISD::BITREVERSE: {
- // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining.
- assert(Subtarget.hasStdExtZbp() && "Unexpected custom legalisation");
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
- // Start with the maximum immediate value which is the bitwidth - 1.
- unsigned Imm = VT.getSizeInBits() - 1;
- // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits.
- if (Op.getOpcode() == ISD::BSWAP)
- Imm &= ~0x7U;
- return DAG.getNode(RISCVISD::GREV, DL, VT, Op.getOperand(0),
- DAG.getConstant(Imm, DL, VT));
+ if (Subtarget.hasStdExtZbp()) {
+ // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combining.
+ // Start with the maximum immediate value which is the bitwidth - 1.
+ unsigned Imm = VT.getSizeInBits() - 1;
+ // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits.
+ if (Op.getOpcode() == ISD::BSWAP)
+ Imm &= ~0x7U;
+ return DAG.getNode(RISCVISD::GREV, DL, VT, Op.getOperand(0),
+ DAG.getConstant(Imm, DL, VT));
+ }
+ assert(Subtarget.hasStdExtZbkb() && "Unexpected custom legalization");
+ assert(Op.getOpcode() == ISD::BITREVERSE && "Unexpected opcode");
+ // Expand bitreverse to a bswap(rev8) followed by brev8.
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Op.getOperand(0));
+ // We use the Zbp grevi encoding for rev.b/brev8 which will be recognized
+ // as brev8 by an isel pattern.
+ return DAG.getNode(RISCVISD::GREV, DL, VT, BSwap,
+ DAG.getConstant(7, DL, VT));
}
case ISD::FSHL:
case ISD::FSHR: {
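(Editorial sketch, not part of the patch.) The Zbkb lowering above relies on the identity bitreverse(x) == brev8(bswap(x)): byte-reversing first and then reversing the bits inside each byte reverses all the bits. A self-contained C++ check of that identity; all helper names are made up for illustration.

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0xff00) | ((X << 8) & 0xff0000) | (X << 24);
}

// Reverse the bits within each byte (what grevi with shamt 7, i.e. brev8, does).
static uint32_t brev8(uint32_t X) {
  uint32_t R = 0;
  for (unsigned B = 0; B < 4; ++B) {
    uint8_t Byte = (X >> (8 * B)) & 0xff;
    uint8_t Rev = 0;
    for (unsigned I = 0; I < 8; ++I)
      if (Byte & (1u << I))
        Rev |= 1u << (7 - I);
    R |= uint32_t(Rev) << (8 * B);
  }
  return R;
}

static uint32_t bitreverse32(uint32_t X) {
  uint32_t R = 0;
  for (unsigned I = 0; I < 32; ++I)
    if (X & (1u << I))
      R |= 1u << (31 - I);
  return R;
}

int main() {
  uint32_t X = 0x12345678;
  assert(bitreverse32(X) == brev8(bswap32(X)));
  return 0;
}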
@@ -3063,6 +3076,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// minimum size. e.g. <vscale x 2 x i32>. VLENB is in bytes so we calculate
// vscale as VLENB / 8.
static_assert(RISCV::RVVBitsPerBlock == 64, "Unexpected bits per block!");
+ if (Subtarget.getMinVLen() < RISCV::RVVBitsPerBlock)
+ report_fatal_error("Support for VLEN==32 is incomplete.");
if (isa<ConstantSDNode>(Op.getOperand(0))) {
// We assume VLENB is a multiple of 8. We manually choose the best shift
// here because SimplifyDemandedBits isn't always able to simplify it.
@@ -4288,8 +4303,47 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT XLenVT = Subtarget.getXLenVT();
if (VecVT.getVectorElementType() == MVT::i1) {
- // FIXME: For now we just promote to an i8 vector and extract from that,
- // but this is probably not optimal.
+ if (VecVT.isFixedLengthVector()) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ if (NumElts >= 8) {
+ MVT WideEltVT;
+ unsigned WidenVecLen;
+ SDValue ExtractElementIdx;
+ SDValue ExtractBitIdx;
+ unsigned MaxEEW = Subtarget.getMaxELENForFixedLengthVectors();
+ MVT LargestEltVT = MVT::getIntegerVT(
+ std::min(MaxEEW, unsigned(XLenVT.getSizeInBits())));
+ if (NumElts <= LargestEltVT.getSizeInBits()) {
+ assert(isPowerOf2_32(NumElts) &&
+ "the number of elements should be power of 2");
+ WideEltVT = MVT::getIntegerVT(NumElts);
+ WidenVecLen = 1;
+ ExtractElementIdx = DAG.getConstant(0, DL, XLenVT);
+ ExtractBitIdx = Idx;
+ } else {
+ WideEltVT = LargestEltVT;
+ WidenVecLen = NumElts / WideEltVT.getSizeInBits();
+ // extract element index = index / element width
+ ExtractElementIdx = DAG.getNode(
+ ISD::SRL, DL, XLenVT, Idx,
+ DAG.getConstant(Log2_64(WideEltVT.getSizeInBits()), DL, XLenVT));
+ // mask bit index = index % element width
+ ExtractBitIdx = DAG.getNode(
+ ISD::AND, DL, XLenVT, Idx,
+ DAG.getConstant(WideEltVT.getSizeInBits() - 1, DL, XLenVT));
+ }
+ MVT WideVT = MVT::getVectorVT(WideEltVT, WidenVecLen);
+ Vec = DAG.getNode(ISD::BITCAST, DL, WideVT, Vec);
+ SDValue ExtractElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, XLenVT,
+ Vec, ExtractElementIdx);
+ // Extract the bit from GPR.
+ SDValue ShiftRight =
+ DAG.getNode(ISD::SRL, DL, XLenVT, ExtractElt, ExtractBitIdx);
+ return DAG.getNode(ISD::AND, DL, XLenVT, ShiftRight,
+ DAG.getConstant(1, DL, XLenVT));
+ }
+ }
+ // Otherwise, promote to an i8 vector and extract from that.
MVT WideVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorElementCount());
Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Vec);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, Idx);
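(Editorial sketch, not part of the patch.) The index arithmetic in the new i1 extract path splits the mask bit index into an element index (index / element width, done with SRL) and a bit index within that element (index % element width, done with AND). A tiny C++ illustration, assuming a v64i1 mask bitcast to v2i32 on a target whose maximum EEW is 32; the concrete values are hypothetical.

#include <cassert>

int main() {
  const unsigned WideEltBits = 32;                  // widened element width
  unsigned Idx = 37;                                // mask bit to extract
  unsigned ExtractElementIdx = Idx >> 5;            // Idx / WideEltBits -> element 1
  unsigned ExtractBitIdx = Idx & (WideEltBits - 1); // Idx % WideEltBits -> bit 5
  assert(ExtractElementIdx == 1 && ExtractBitIdx == 5);
  // The lowering extracts element ExtractElementIdx into a GPR, shifts it
  // right by ExtractBitIdx and ANDs with 1 to produce the i1 result.
  unsigned MaskElt = 0x00000020;                    // element 1 with bit 5 set
  assert(((MaskElt >> ExtractBitIdx) & 1) == 1);
  return 0;
}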
@@ -4411,15 +4465,30 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getRegister(RISCV::X4, PtrVT);
}
case Intrinsic::riscv_orc_b:
- // Lower to the GORCI encoding for orc.b.
- return DAG.getNode(RISCVISD::GORC, DL, XLenVT, Op.getOperand(1),
+ case Intrinsic::riscv_brev8: {
+ // Lower to the GORCI encoding for orc.b or the GREVI encoding for brev8.
+ unsigned Opc =
+ IntNo == Intrinsic::riscv_brev8 ? RISCVISD::GREV : RISCVISD::GORC;
+ return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1),
DAG.getConstant(7, DL, XLenVT));
+ }
case Intrinsic::riscv_grev:
case Intrinsic::riscv_gorc: {
unsigned Opc =
IntNo == Intrinsic::riscv_grev ? RISCVISD::GREV : RISCVISD::GORC;
return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2));
}
+ case Intrinsic::riscv_zip:
+ case Intrinsic::riscv_unzip: {
+ // Lower to the SHFLI encoding for zip or the UNSHFLI encoding for unzip.
+ // For i32 the immediate is 15. For i64 the immediate is 31.
+ unsigned Opc =
+ IntNo == Intrinsic::riscv_zip ? RISCVISD::SHFL : RISCVISD::UNSHFL;
+ unsigned BitWidth = Op.getValueSizeInBits();
+ assert(isPowerOf2_32(BitWidth) && BitWidth >= 2 && "Unexpected bit width");
+ return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1),
+ DAG.getConstant((BitWidth / 2) - 1, DL, XLenVT));
+ }
case Intrinsic::riscv_shfl:
case Intrinsic::riscv_unshfl: {
unsigned Opc =
@@ -5829,14 +5898,17 @@ SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op,
}
}
- if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
- IndexVT = IndexVT.changeVectorElementType(XLenVT);
- Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
- }
-
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, Mask.getValueType(),
+ VL);
+ Index = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, IndexVT, Index,
+ TrueMask, VL);
+ }
+
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vluxei : Intrinsic::riscv_vluxei_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
@@ -5937,14 +6009,17 @@ SDValue RISCVTargetLowering::lowerMaskedScatter(SDValue Op,
}
}
- if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
- IndexVT = IndexVT.changeVectorElementType(XLenVT);
- Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
- }
-
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, Mask.getValueType(),
+ VL);
+ Index = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, IndexVT, Index,
+ TrueMask, VL);
+ }
+
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vsoxei : Intrinsic::riscv_vsoxei_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
@@ -6568,7 +6643,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
unsigned Opc =
IntNo == Intrinsic::riscv_shfl ? RISCVISD::SHFLW : RISCVISD::UNSHFLW;
- if (isa<ConstantSDNode>(N->getOperand(2))) {
+ // There is no (UN)SHFLIW. If the control word is a constant, we can use
+ // (UN)SHFLI with bit 4 of the control word cleared. The upper 32 bit half
+ // will be shuffled the same way as the lower 32 bit half, but the two
+ // halves won't cross.
+ if (isa<ConstantSDNode>(NewOp2)) {
NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2,
DAG.getConstant(0xf, DL, MVT::i64));
Opc =
@@ -7284,8 +7363,8 @@ static SDValue performANY_EXTENDCombine(SDNode *N,
return SDValue(N, 0);
}
-// Try to form VWMUL or VWMULU.
-// FIXME: Support VWMULSU.
+// Try to form VWMUL, VWMULU or VWMULSU.
+// TODO: Support VWMULSU.vx with a sign-extended Op and a splat of a scalar Op.
static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
bool Commute) {
assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
@@ -7296,6 +7375,7 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
+ bool IsVWMULSU = IsSignExt && Op1.getOpcode() == RISCVISD::VZEXT_VL;
if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse())
return SDValue();
@@ -7316,7 +7396,7 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
// See if the other operand is the same opcode.
- if (Op0.getOpcode() == Op1.getOpcode()) {
+ if (IsVWMULSU || Op0.getOpcode() == Op1.getOpcode()) {
if (!Op1.hasOneUse())
return SDValue();
@@ -7366,7 +7446,9 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
if (Op1.getValueType() != NarrowVT)
Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
- unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+ unsigned WMulOpc = RISCVISD::VWMULSU_VL;
+ if (!IsVWMULSU)
+ WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
}
@@ -8194,12 +8276,17 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
- case RISCVISD::READ_VLENB:
- // We assume VLENB is at least 16 bytes.
- Known.Zero.setLowBits(4);
+ case RISCVISD::READ_VLENB: {
+ // If we know the minimum VLen from Zvl extensions, we can use that to
+ // determine the trailing zeros of VLENB.
+ // FIXME: Limit to 128 bit vectors until we have more testing.
+ unsigned MinVLenB = std::min(128U, Subtarget.getMinVLen()) / 8;
+ if (MinVLenB > 0)
+ Known.Zero.setLowBits(Log2_32(MinVLenB));
// We assume VLENB is no more than 65536 / 8 bytes.
Known.Zero.setBitsFrom(14);
break;
+ }
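(Editorial sketch, not part of the patch.) With a minimum VLEN guaranteed by a Zvl extension, VLENB is a multiple of MinVLen/8, so that many trailing bits are known zero. A minimal check assuming Zvl128b; the helper log2u stands in for Log2_32 and is made up for illustration.

#include <cassert>

static unsigned log2u(unsigned V) { unsigned L = 0; while (V >>= 1) ++L; return L; }

int main() {
  unsigned MinVLen = 128;          // assumed Zvl128b lower bound
  unsigned MinVLenB = MinVLen / 8; // VLENB is a multiple of 16 bytes
  assert(log2u(MinVLenB) == 4);    // so its low 4 bits are known zero,
  return 0;                        // matching the previous hard-coded 4.
}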
case ISD::INTRINSIC_W_CHAIN:
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntNo =
@@ -8230,9 +8317,11 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
default:
break;
case RISCVISD::SELECT_CC: {
- unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth + 1);
+ unsigned Tmp =
+ DAG.ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth + 1);
if (Tmp == 1) return 1; // Early out.
- unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(4), DemandedElts, Depth + 1);
+ unsigned Tmp2 =
+ DAG.ComputeNumSignBits(Op.getOperand(4), DemandedElts, Depth + 1);
return std::min(Tmp, Tmp2);
}
case RISCVISD::SLLW:
@@ -8275,15 +8364,18 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
}
break;
}
- case RISCVISD::VMV_X_S:
+ case RISCVISD::VMV_X_S: {
// The number of sign bits of the scalar result is computed by obtaining the
// element type of the input vector operand, subtracting its width from the
// XLEN, and then adding one (sign bit within the element type). If the
// element type is wider than XLen, the least-significant XLEN bits are
// taken.
- if (Op.getOperand(0).getScalarValueSizeInBits() > Subtarget.getXLen())
- return 1;
- return Subtarget.getXLen() - Op.getOperand(0).getScalarValueSizeInBits() + 1;
+ unsigned XLen = Subtarget.getXLen();
+ unsigned EltBits = Op.getOperand(0).getScalarValueSizeInBits();
+ if (EltBits <= XLen)
+ return XLen - EltBits + 1;
+ break;
+ }
}
return 1;
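(Editorial sketch, not part of the patch.) The VMV_X_S formula counts the sign bits of an element sign-extended to XLEN. A tiny illustration assuming RV64 and an i8 element type; values are hypothetical.

#include <cassert>

int main() {
  unsigned XLen = 64;   // assumed RV64
  unsigned EltBits = 8; // vmv.x.s from a vector of i8
  // Bits 63..7 of the result all copy the element's sign bit.
  assert(XLen - EltBits + 1 == 57);
  return 0;
}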
@@ -10129,6 +10221,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FP_ROUND_VL)
NODE_NAME_CASE(VWMUL_VL)
NODE_NAME_CASE(VWMULU_VL)
+ NODE_NAME_CASE(VWMULSU_VL)
NODE_NAME_CASE(VWADDU_VL)
NODE_NAME_CASE(SETCC_VL)
NODE_NAME_CASE(VSELECT_VL)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 58b7ec89f875..840a821870a7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -245,6 +245,7 @@ enum NodeType : unsigned {
// Widening instructions
VWMUL_VL,
VWMULU_VL,
+ VWMULSU_VL,
VWADDU_VL,
// Vector compare producing a mask. Fourth operand is input mask. Fifth
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index d39e0805a79c..649eb57b325b 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -999,6 +999,12 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
VSETVLIInfo CurInfo;
+ // BBLocalInfo tracks the VL/VTYPE state the same way BBInfo.Change was
+ // calculated in computeIncomingVLVTYPE. We need this to apply
+ // canSkipVSETVLIForLoadStore the same way computeIncomingVLVTYPE did. We
+ // can't include predecessor information in that decision, or we would
+ // disagree with the global analysis.
+ VSETVLIInfo BBLocalInfo;
// Only be set if current VSETVLIInfo is from an explicit VSET(I)VLI.
MachineInstr *PrevVSETVLIMI = nullptr;
@@ -1014,6 +1020,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
MI.getOperand(3).setIsDead(false);
MI.getOperand(4).setIsDead(false);
CurInfo = getInfoForVSETVLI(MI);
+ BBLocalInfo = getInfoForVSETVLI(MI);
PrevVSETVLIMI = &MI;
continue;
}
@@ -1043,12 +1050,22 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
// use the predecessor information.
assert(BlockInfo[MBB.getNumber()].Pred.isValid() &&
"Expected a valid predecessor state.");
- if (needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) &&
+ // Don't use predecessor information if there was an earlier instruction
+ // in this block that allowed a vsetvli to be skipped for load/store.
+ if (!(BBLocalInfo.isValid() &&
+ canSkipVSETVLIForLoadStore(MI, NewInfo, BBLocalInfo)) &&
+ needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) &&
needVSETVLIPHI(NewInfo, MBB)) {
insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred);
CurInfo = NewInfo;
+ BBLocalInfo = NewInfo;
}
+
+ // We must update BBLocalInfo for every vector instruction.
+ if (!BBLocalInfo.isValid())
+ BBLocalInfo = NewInfo;
} else {
+ assert(BBLocalInfo.isValid());
// If this instruction isn't compatible with the previous VL/VTYPE
// we need to insert a VSETVLI.
// If this is a unit-stride or strided load/store, we may be able to use
@@ -1084,6 +1101,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
if (NeedInsertVSETVLI)
insertVSETVLI(MBB, MI, NewInfo, CurInfo);
CurInfo = NewInfo;
+ BBLocalInfo = NewInfo;
}
}
PrevVSETVLIMI = nullptr;
@@ -1094,6 +1112,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
MI.modifiesRegister(RISCV::VTYPE)) {
CurInfo = VSETVLIInfo::getUnknown();
+ BBLocalInfo = VSETVLIInfo::getUnknown();
PrevVSETVLIMI = nullptr;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7baed2793e4e..55f4a19b79eb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -654,8 +654,8 @@ void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result)
.addImm(Inst.Imm)
.setMIFlag(Flag);
- } else if (Inst.Opc == RISCV::ADDUW) {
- BuildMI(MBB, MBBI, DL, get(RISCV::ADDUW), Result)
+ } else if (Inst.Opc == RISCV::ADD_UW) {
+ BuildMI(MBB, MBBI, DL, get(RISCV::ADD_UW), Result)
.addReg(SrcReg, RegState::Kill)
.addReg(RISCV::X0)
.setMIFlag(Flag);
@@ -965,93 +965,29 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
}
unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ if (MI.isMetaInstruction())
+ return 0;
+
unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- default: {
- if (MI.getParent() && MI.getParent()->getParent()) {
- const auto MF = MI.getMF();
- const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget());
- const MCRegisterInfo &MRI = *TM.getMCRegisterInfo();
- const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
- const RISCVSubtarget &ST = MF->getSubtarget<RISCVSubtarget>();
- if (isCompressibleInst(MI, &ST, MRI, STI))
- return 2;
- }
- return get(Opcode).getSize();
- }
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::DBG_VALUE:
- return 0;
- // These values are determined based on RISCVExpandAtomicPseudoInsts,
- // RISCVExpandPseudoInsts and RISCVMCCodeEmitter, depending on where the
- // pseudos are expanded.
- case RISCV::PseudoCALLReg:
- case RISCV::PseudoCALL:
- case RISCV::PseudoJump:
- case RISCV::PseudoTAIL:
- case RISCV::PseudoLLA:
- case RISCV::PseudoLA:
- case RISCV::PseudoLA_TLS_IE:
- case RISCV::PseudoLA_TLS_GD:
- return 8;
- case RISCV::PseudoAtomicLoadNand32:
- case RISCV::PseudoAtomicLoadNand64:
- return 20;
- case RISCV::PseudoMaskedAtomicSwap32:
- case RISCV::PseudoMaskedAtomicLoadAdd32:
- case RISCV::PseudoMaskedAtomicLoadSub32:
- return 28;
- case RISCV::PseudoMaskedAtomicLoadNand32:
- return 32;
- case RISCV::PseudoMaskedAtomicLoadMax32:
- case RISCV::PseudoMaskedAtomicLoadMin32:
- return 44;
- case RISCV::PseudoMaskedAtomicLoadUMax32:
- case RISCV::PseudoMaskedAtomicLoadUMin32:
- return 36;
- case RISCV::PseudoCmpXchg32:
- case RISCV::PseudoCmpXchg64:
- return 16;
- case RISCV::PseudoMaskedCmpXchg32:
- return 32;
- case TargetOpcode::INLINEASM:
- case TargetOpcode::INLINEASM_BR: {
+ if (Opcode == TargetOpcode::INLINEASM ||
+ Opcode == TargetOpcode::INLINEASM_BR) {
const MachineFunction &MF = *MI.getParent()->getParent();
const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget());
return getInlineAsmLength(MI.getOperand(0).getSymbolName(),
*TM.getMCAsmInfo());
}
- case RISCV::PseudoVSPILL2_M1:
- case RISCV::PseudoVSPILL2_M2:
- case RISCV::PseudoVSPILL2_M4:
- case RISCV::PseudoVSPILL3_M1:
- case RISCV::PseudoVSPILL3_M2:
- case RISCV::PseudoVSPILL4_M1:
- case RISCV::PseudoVSPILL4_M2:
- case RISCV::PseudoVSPILL5_M1:
- case RISCV::PseudoVSPILL6_M1:
- case RISCV::PseudoVSPILL7_M1:
- case RISCV::PseudoVSPILL8_M1:
- case RISCV::PseudoVRELOAD2_M1:
- case RISCV::PseudoVRELOAD2_M2:
- case RISCV::PseudoVRELOAD2_M4:
- case RISCV::PseudoVRELOAD3_M1:
- case RISCV::PseudoVRELOAD3_M2:
- case RISCV::PseudoVRELOAD4_M1:
- case RISCV::PseudoVRELOAD4_M2:
- case RISCV::PseudoVRELOAD5_M1:
- case RISCV::PseudoVRELOAD6_M1:
- case RISCV::PseudoVRELOAD7_M1:
- case RISCV::PseudoVRELOAD8_M1: {
- // The values are determined based on expandVSPILL and expandVRELOAD that
- // expand the pseudos depending on NF.
- unsigned NF = isRVVSpillForZvlsseg(Opcode)->first;
- return 4 * (2 * NF - 1);
- }
+
+ if (MI.getParent() && MI.getParent()->getParent()) {
+ const auto MF = MI.getMF();
+ const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget());
+ const MCRegisterInfo &MRI = *TM.getMCRegisterInfo();
+ const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
+ const RISCVSubtarget &ST = MF->getSubtarget<RISCVSubtarget>();
+ if (isCompressibleInst(MI, &ST, MRI, STI))
+ return 2;
}
+ return get(Opcode).getSize();
}
bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 64cd89cda06a..ee6a74b7f14f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1183,7 +1183,7 @@ def : Pat<(brind (add GPRJALR:$rs1, simm12:$imm12)),
// destination.
// Define AsmString to print "call" when compile with -S flag.
// Define isCodeGenOnly = 0 to support parsing assembly "call" instruction.
-let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, hasSideEffects = 0,
+let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, Size = 8, hasSideEffects = 0,
mayStore = 0, mayLoad = 0 in
def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> {
let AsmString = "call\t$rd, $func";
@@ -1195,7 +1195,7 @@ def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> {
// if the offset fits in a signed 21-bit immediate.
// Define AsmString to print "call" when compile with -S flag.
// Define isCodeGenOnly = 0 to support parsing assembly "call" instruction.
-let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in
+let isCall = 1, Defs = [X1], isCodeGenOnly = 0, Size = 8 in
def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> {
let AsmString = "call\t$func";
}
@@ -1220,7 +1220,7 @@ def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_flag)]>,
// expand to auipc and jalr while encoding.
// Define AsmString to print "tail" when compile with -S flag.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2],
- isCodeGenOnly = 0 in
+ Size = 8, isCodeGenOnly = 0 in
def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []> {
let AsmString = "tail\t$dst";
}
@@ -1235,28 +1235,28 @@ def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)),
def : Pat<(riscv_tail (iPTR texternalsym:$dst)),
(PseudoTAIL texternalsym:$dst)>;
-let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1,
+let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1, Size = 8,
isCodeGenOnly = 0, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []> {
let AsmString = "jump\t$target, $rd";
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"lla", "$dst, $src">;
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
def PseudoLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la", "$dst, $src">;
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.ie", "$dst, $src">;
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.gd", "$dst, $src">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index ee10c3a54b2f..7d23dafb0346 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -188,6 +188,7 @@ class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch),
let hasSideEffects = 0;
}
+let Size = 20 in
def PseudoAtomicLoadNand32 : PseudoAMO;
// Ordering constants must be kept in sync with the AtomicOrdering enum in
// AtomicOrdering.h.
@@ -242,27 +243,35 @@ class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst>
(AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt,
timm:$ordering)>;
+let Size = 28 in
def PseudoMaskedAtomicSwap32 : PseudoMaskedAMO;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i32,
PseudoMaskedAtomicSwap32>;
+let Size = 28 in
def PseudoMaskedAtomicLoadAdd32 : PseudoMaskedAMO;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_add_i32,
PseudoMaskedAtomicLoadAdd32>;
+let Size = 28 in
def PseudoMaskedAtomicLoadSub32 : PseudoMaskedAMO;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_sub_i32,
PseudoMaskedAtomicLoadSub32>;
+let Size = 32 in
def PseudoMaskedAtomicLoadNand32 : PseudoMaskedAMO;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_nand_i32,
PseudoMaskedAtomicLoadNand32>;
+let Size = 44 in
def PseudoMaskedAtomicLoadMax32 : PseudoMaskedAMOMinMax;
def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_max_i32,
PseudoMaskedAtomicLoadMax32>;
+let Size = 44 in
def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMOMinMax;
def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_min_i32,
PseudoMaskedAtomicLoadMin32>;
+let Size = 36 in
def PseudoMaskedAtomicLoadUMax32 : PseudoMaskedAMOUMinUMax;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax_i32,
PseudoMaskedAtomicLoadUMax32>;
+let Size = 36 in
def PseudoMaskedAtomicLoadUMin32 : PseudoMaskedAMOUMinUMax;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i32,
PseudoMaskedAtomicLoadUMin32>;
@@ -276,6 +285,7 @@ class PseudoCmpXchg
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 0;
+ let Size = 16;
}
// Ordering constants must be kept in sync with the AtomicOrdering enum in
@@ -304,6 +314,7 @@ def PseudoMaskedCmpXchg32
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 0;
+ let Size = 32;
}
def : Pat<(int_riscv_masked_cmpxchg_i32
@@ -347,6 +358,7 @@ def : Pat<(i64 (atomic_load_sub_64_seq_cst GPR:$addr, GPR:$incr)),
/// 64-bit pseudo AMOs
+let Size = 20 in
def PseudoAtomicLoadNand64 : PseudoAMO;
// Ordering constants must be kept in sync with the AtomicOrdering enum in
// AtomicOrdering.h.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 4e7e251bc412..9087ed50f9fc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -3836,7 +3836,7 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> {
}
multiclass VPatCompare_VI<string intrinsic, string inst,
- ImmLeaf ImmType = simm5_plus1> {
+ ImmLeaf ImmType> {
foreach vti = AllIntegerVectors in {
defvar Intr = !cast<Intrinsic>(intrinsic);
defvar Pseudo = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX);
@@ -3899,11 +3899,13 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in {
foreach lmul = MxList in {
foreach nf = NFSet<lmul>.L in {
defvar vreg = SegRegClass<lmul, nf>.RC;
- let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1,
+ Size = !mul(4, !sub(!mul(nf, 2), 1)) in {
def "PseudoVSPILL" # nf # "_" # lmul.MX :
Pseudo<(outs), (ins vreg:$rs1, GPR:$rs2, GPR:$vlenb), []>;
}
- let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1,
+ Size = !mul(4, !sub(!mul(nf, 2), 1)) in {
def "PseudoVRELOAD" # nf # "_" # lmul.MX :
Pseudo<(outs vreg:$rs1), (ins GPR:$rs2, GPR:$vlenb), []>;
}
@@ -4657,13 +4659,15 @@ defm : VPatBinarySwappedM_VV<"int_riscv_vmsgt", "PseudoVMSLT", AllIntegerVectors
defm : VPatBinarySwappedM_VV<"int_riscv_vmsgeu", "PseudoVMSLEU", AllIntegerVectors>;
defm : VPatBinarySwappedM_VV<"int_riscv_vmsge", "PseudoVMSLE", AllIntegerVectors>;
-// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16. This
-// avoids the user needing to know that there is no vmslt(u).vi instruction.
-// Similar for vmsge(u).vx intrinsics using vmslt(u).vi.
-defm : VPatCompare_VI<"int_riscv_vmslt", "PseudoVMSLE">;
+// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16 and
+// non-zero. Zero can be .vx with x0. This avoids the user needing to know that
+// there is no vmslt(u).vi instruction. Similar for vmsge(u).vx intrinsics
+// using vmslt(u).vi.
+defm : VPatCompare_VI<"int_riscv_vmslt", "PseudoVMSLE", simm5_plus1_nonzero>;
defm : VPatCompare_VI<"int_riscv_vmsltu", "PseudoVMSLEU", simm5_plus1_nonzero>;
-defm : VPatCompare_VI<"int_riscv_vmsge", "PseudoVMSGT">;
+// We need to handle 0 for vmsge.vi using vmslt.vi because there is no vmsge.vx.
+defm : VPatCompare_VI<"int_riscv_vmsge", "PseudoVMSGT", simm5_plus1>;
defm : VPatCompare_VI<"int_riscv_vmsgeu", "PseudoVMSGTU", simm5_plus1_nonzero>;
//===----------------------------------------------------------------------===//
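(Editorial sketch, not part of the patch.) The VPatCompare_VI rewrites above rely on x < c being equivalent to x <= c - 1 for both signed and unsigned compares, provided c - 1 does not wrap, which is exactly why the unsigned patterns exclude zero. A small brute-force C++ check over the relevant immediate range, illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  // Signed: vmslt.vx with scalar C becomes vmsle.vi with C - 1 (C in -15..16).
  for (int64_t C = -15; C <= 16; ++C)
    for (int64_t X = -40; X <= 40; ++X)
      assert((X < C) == (X <= C - 1));
  // Unsigned: the same holds for vmsltu/vmsleu when C is non-zero
  // (C == 0 would wrap to the maximum value, so the pattern excludes it).
  for (uint64_t C = 1; C <= 16; ++C)
    for (uint64_t X = 0; X <= 40; ++X)
      assert((X < C) == (X <= C - 1));
  return 0;
}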
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index e452a84a9a6f..2b920d29ab81 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -539,7 +539,7 @@ defm : VPatIntegerSetCCSDNode_VV_VX_VI<SETNE, "PseudoVMSNE">;
defm : VPatIntegerSetCCSDNode_VV_VX<SETLT, "PseudoVMSLT">;
defm : VPatIntegerSetCCSDNode_VV_VX<SETULT, "PseudoVMSLTU">;
defm : VPatIntegerSetCCSDNode_VIPlus1<SETLT, "PseudoVMSLE",
- SplatPat_simm5_plus1>;
+ SplatPat_simm5_plus1_nonzero>;
defm : VPatIntegerSetCCSDNode_VIPlus1<SETULT, "PseudoVMSLEU",
SplatPat_simm5_plus1_nonzero>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 964f0fa54512..e71c498fd5f4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -228,6 +228,7 @@ def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
SDTCisVT<4, XLenVT>]>;
def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+def riscv_vwmulsu_vl : SDNode<"RISCVISD::VWMULSU_VL", SDT_RISCVVWBinOp_VL>;
def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
def SDTRVVVecReduce : SDTypeProfile<1, 5, [
@@ -832,7 +833,7 @@ foreach vti = AllIntegerVectors in {
defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGTU", SETUGT, SETULT>;
defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLE", SETLT,
- SplatPat_simm5_plus1>;
+ SplatPat_simm5_plus1_nonzero>;
defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLEU", SETULT,
SplatPat_simm5_plus1_nonzero>;
defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSGT", SETGE,
@@ -861,6 +862,7 @@ defm : VPatBinaryVL_VV_VX<riscv_srem_vl, "PseudoVREM">;
// 12.12. Vector Widening Integer Multiply Instructions
defm : VPatBinaryWVL_VV_VX<riscv_vwmul_vl, "PseudoVWMUL">;
defm : VPatBinaryWVL_VV_VX<riscv_vwmulu_vl, "PseudoVWMULU">;
+defm : VPatBinaryWVL_VV_VX<riscv_vwmulsu_vl, "PseudoVWMULSU">;
// 12.13 Vector Single-Width Integer Multiply-Add Instructions
foreach vti = AllIntegerVectors in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index db3f5851879a..07884d35f63c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -337,13 +337,39 @@ def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">,
Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
} // Predicates = [HasStdExtZba]
+let Predicates = [HasStdExtZba, IsRV64] in {
+def SLLI_UW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">,
+ Sched<[WriteShiftImm32, ReadShiftImm32]>;
+def ADD_UW : ALUW_rr<0b0000100, 0b000, "add.uw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def SH1ADD_UW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">,
+ Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
+def SH2ADD_UW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">,
+ Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
+def SH3ADD_UW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">,
+ Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
+} // Predicates = [HasStdExtZba, IsRV64]
+
let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def ROL : ALU_rr<0b0110000, 0b001, "rol">,
Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>;
def ROR : ALU_rr<0b0110000, 0b101, "ror">,
Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>;
+
+def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">,
+ Sched<[WriteRotateImm, ReadRotateImm]>;
} // Predicates = [HasStdExtZbbOrZbpOrZbkb]
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
+def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">,
+ Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
+def RORW : ALUW_rr<0b0110000, 0b101, "rorw">,
+ Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
+
+def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">,
+ Sched<[WriteRotateImm32, ReadRotateImm32]>;
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64]
+
let Predicates = [HasStdExtZbs] in {
def BCLR : ALU_rr<0b0100100, 0b001, "bclr">,
Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
@@ -353,27 +379,7 @@ def BINV : ALU_rr<0b0110100, 0b001, "binv">,
Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
def BEXT : ALU_rr<0b0100100, 0b101, "bext">,
Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
-} // Predicates = [HasStdExtZbs]
-
-let Predicates = [HasStdExtZbp] in {
-def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
-def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
-} // Predicates = [HasStdExtZbp]
-let Predicates = [HasStdExtZbpOrZbkx] in {
-def XPERMN : ALU_rr<0b0010100, 0b010, "xperm4">, Sched<[]>;
-def XPERMB : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>;
-} // Predicates = [HasStdExtZbpOrZbkx]
-
-let Predicates = [HasStdExtZbp] in {
-def XPERMH : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>;
-} // Predicates = [HasStdExtZbp]
-
-let Predicates = [HasStdExtZbbOrZbpOrZbkb] in
-def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">,
- Sched<[WriteRotateImm, ReadRotateImm]>;
-
-let Predicates = [HasStdExtZbs] in {
def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">,
Sched<[WriteSingleBitImm, ReadSingleBitImm]>;
def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">,
@@ -385,10 +391,42 @@ def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">,
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
+def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
+def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
+
def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, Sched<[]>;
def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, Sched<[]>;
+
+def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>;
+def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>;
+
+def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
+def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
+
+def XPERM_H : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
+def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
+
+def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>;
+def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>;
+
+def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>;
+def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>;
+
+def XPERM_W : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+// These instructions were named xperm.n and xperm.b in the last version of
+// the draft bit manipulation specification they were included in. However, we
+// use the mnemonics given to them in the ratified Zbkx extension.
+let Predicates = [HasStdExtZbpOrZbkx] in {
+def XPERM4 : ALU_rr<0b0010100, 0b010, "xperm4">, Sched<[]>;
+def XPERM8 : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>;
+} // Predicates = [HasStdExtZbpOrZbkx]
+
let Predicates = [HasStdExtZbt] in {
def CMIX : RVBTernaryR<0b11, 0b001, OPC_OP, "cmix", "$rd, $rs2, $rs1, $rs3">,
Sched<[]>;
@@ -402,6 +440,15 @@ def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri",
"$rd, $rs1, $rs3, $shamt">, Sched<[]>;
} // Predicates = [HasStdExtZbt]
+let Predicates = [HasStdExtZbt, IsRV64] in {
+def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32,
+ "fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
+def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw",
+ "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
+def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32,
+ "fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>;
+} // Predicates = [HasStdExtZbt, IsRV64]
+
let Predicates = [HasStdExtZbb] in {
def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM, "clz">,
Sched<[WriteCLZ, ReadCLZ]>;
@@ -411,42 +458,45 @@ def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM, "cpop">,
Sched<[WriteCPOP, ReadCPOP]>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbm, IsRV64] in
-def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">,
- Sched<[]>;
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM_32, "clzw">,
+ Sched<[WriteCLZ32, ReadCLZ32]>;
+def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM_32, "ctzw">,
+ Sched<[WriteCTZ32, ReadCTZ32]>;
+def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM_32, "cpopw">,
+ Sched<[WriteCPOP32, ReadCPOP32]>;
+} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbb] in {
-def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, OPC_OP_IMM, "sext.b">,
- Sched<[WriteIALU, ReadIALU]>;
-def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">,
- Sched<[WriteIALU, ReadIALU]>;
+def SEXT_B : RVBUnary<0b0110000, 0b00100, 0b001, OPC_OP_IMM, "sext.b">,
+ Sched<[WriteIALU, ReadIALU]>;
+def SEXT_H : RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">,
+ Sched<[WriteIALU, ReadIALU]>;
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbr] in {
-def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">,
- Sched<[]>;
-def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">,
- Sched<[]>;
-def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">,
- Sched<[]>;
-} // Predicates = [HasStdExtZbr]
-
-let Predicates = [HasStdExtZbr, IsRV64] in
-def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">,
+def CRC32_B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">,
Sched<[]>;
-
-let Predicates = [HasStdExtZbr] in {
-def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">,
+def CRC32_H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">,
Sched<[]>;
-def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">,
- Sched<[]>;
-def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">,
+def CRC32_W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">,
Sched<[]>;
+
+def CRC32C_B : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">,
+ Sched<[]>;
+def CRC32C_H : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">,
+ Sched<[]>;
+def CRC32C_W : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">,
+ Sched<[]>;
} // Predicates = [HasStdExtZbr]
-let Predicates = [HasStdExtZbr, IsRV64] in
-def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">,
- Sched<[]>;
+let Predicates = [HasStdExtZbr, IsRV64] in {
+def CRC32_D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">,
+ Sched<[]>;
+
+def CRC32C_D : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">,
+ Sched<[]>;
+} // Predicates = [HasStdExtZbr, IsRV64]
let Predicates = [HasStdExtZbc] in {
def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">,
@@ -472,8 +522,6 @@ def MAXU : ALU_rr<0b0000101, 0b111, "maxu">,
} // Predicates = [HasStdExtZbb]
let Predicates = [HasStdExtZbp] in {
-def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>;
-def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbe] in {
@@ -483,15 +531,31 @@ def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, Sched<[]>;
def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, Sched<[]>;
} // Predicates = [HasStdExtZbe]
+let Predicates = [HasStdExtZbe, IsRV64] in {
+// NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with
+// bextw in the 0.93 spec.
+def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>;
+def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>;
+} // Predicates = [HasStdExtZbe, IsRV64]
+
let Predicates = [HasStdExtZbpOrZbkb] in {
def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>;
def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
} // Predicates = [HasStdExtZbpOrZbkb]
+let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in
+def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
+
let Predicates = [HasStdExtZbp] in
def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
+let Predicates = [HasStdExtZbp, IsRV64] in
+def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
+
let Predicates = [HasStdExtZbm, IsRV64] in {
+def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">,
+ Sched<[]>;
+
def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>;
def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>;
} // Predicates = [HasStdExtZbm, IsRV64]
@@ -500,105 +564,18 @@ let Predicates = [HasStdExtZbf] in
def BFP : ALU_rr<0b0100100, 0b111, "bfp">,
Sched<[WriteBFP, ReadBFP, ReadBFP]>;
-let Predicates = [HasStdExtZbp] in {
-def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
-def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
-} // Predicates = [HasStdExtZbp]
-
-let Predicates = [HasStdExtZba, IsRV64] in {
-def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">,
- Sched<[WriteShiftImm32, ReadShiftImm32]>;
-def ADDUW : ALUW_rr<0b0000100, 0b000, "add.uw">,
- Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
-def SH1ADDUW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">,
- Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
-def SH2ADDUW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">,
- Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
-def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">,
- Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
-} // Predicates = [HasStdExtZbb, IsRV64]
-
-let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
-def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">,
- Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
-def RORW : ALUW_rr<0b0110000, 0b101, "rorw">,
- Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
-} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
-def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def XPERMW : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
-let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in
-def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">,
- Sched<[WriteRotateImm32, ReadRotateImm32]>;
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>;
-def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
-let Predicates = [HasStdExtZbt, IsRV64] in {
-def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32,
- "fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
-def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw",
- "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
-def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32,
- "fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>;
-} // Predicates = [HasStdExtZbt, IsRV64]
-
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM_32, "clzw">,
- Sched<[WriteCLZ32, ReadCLZ32]>;
-def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM_32, "ctzw">,
- Sched<[WriteCTZ32, ReadCTZ32]>;
-def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM_32, "cpopw">,
- Sched<[WriteCPOP32, ReadCPOP32]>;
-} // Predicates = [HasStdExtZbb, IsRV64]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>;
-def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
-let Predicates = [HasStdExtZbe, IsRV64] in {
-// NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with
-// bextw in the 0.93 spec.
-def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>;
-def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>;
-} // Predicates = [HasStdExtZbe, IsRV64]
-
-let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in
-def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
-
-let Predicates = [HasStdExtZbp, IsRV64] in
-def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
-
let Predicates = [HasStdExtZbf, IsRV64] in
def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">,
Sched<[WriteBFP32, ReadBFP32, ReadBFP32]>;
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def ZEXTH_RV32 : RVInstR<0b0000100, 0b100, OPC_OP, (outs GPR:$rd),
- (ins GPR:$rs1), "zext.h", "$rd, $rs1">,
- Sched<[WriteIALU, ReadIALU]> {
- let rs2 = 0b00000;
-}
+def ZEXT_H_RV32 : RVBUnary<0b0000100, 0b00000, 0b100, OPC_OP, "zext.h">,
+ Sched<[WriteIALU, ReadIALU]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def ZEXTH_RV64 : RVInstR<0b0000100, 0b100, OPC_OP_32, (outs GPR:$rd),
- (ins GPR:$rs1), "zext.h", "$rd, $rs1">,
- Sched<[WriteIALU, ReadIALU]> {
- let rs2 = 0b00000;
-}
+def ZEXT_H_RV64 : RVBUnary<0b0000100, 0b00000, 0b100, OPC_OP_32, "zext.h">,
+ Sched<[WriteIALU, ReadIALU]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
// We treat rev8 and orc.b as standalone instructions even though they use a
@@ -619,8 +596,8 @@ def REV8_RV64 : RVBUnary<0b0110101, 0b11000, 0b101, OPC_OP_IMM, "rev8">,
} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp] in {
-def ORCB : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">,
- Sched<[WriteORCB, ReadORCB]>;
+def ORC_B : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">,
+ Sched<[WriteORCB, ReadORCB]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbpOrZbkb] in
@@ -637,7 +614,7 @@ def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">;
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZba, IsRV64] in {
-def : InstAlias<"zext.w $rd, $rs", (ADDUW GPR:$rd, GPR:$rs, X0)>;
+def : InstAlias<"zext.w $rd, $rs", (ADD_UW GPR:$rd, GPR:$rs, X0)>;
}
let Predicates = [HasStdExtZbp] in {
@@ -775,8 +752,10 @@ def : InstAlias<"gorcw $rd, $rs1, $shamt",
// Zbp is unratified and that it would likely adopt the already ratified Zbkx names.
// Thus current Zbp instructions are defined as aliases for Zbkx instructions.
let Predicates = [HasStdExtZbp] in {
- def : InstAlias<"xperm.b $rd, $rs1, $rs2", (XPERMB GPR:$rd, GPR:$rs1, GPR:$rs2)>;
- def : InstAlias<"xperm.n $rd, $rs1, $rs2", (XPERMN GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+ def : InstAlias<"xperm.b $rd, $rs1, $rs2",
+ (XPERM8 GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+ def : InstAlias<"xperm.n $rd, $rs1, $rs2",
+ (XPERM4 GPR:$rd, GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbs] in {
@@ -803,8 +782,22 @@ def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>;
let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def : PatGprGpr<rotl, ROL>;
def : PatGprGpr<rotr, ROR>;
+
+def : PatGprImm<rotr, RORI, uimmlog2xlen>;
+// There's no encoding for roli in the 'B' extension as it can be
+// implemented with rori by negating the immediate.
+def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt),
+ (RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
} // Predicates = [HasStdExtZbbOrZbpOrZbkb]
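(Editorial sketch, not part of the patch.) The rotl pattern above uses the identity rotl(x, s) == rotr(x, XLen - s), so rori can serve both directions. A self-contained C++ check over 64-bit values; helper names are made up for illustration.

#include <cassert>
#include <cstdint>

static uint64_t rotr64(uint64_t X, unsigned S) {
  S &= 63;
  return S ? (X >> S) | (X << (64 - S)) : X;
}
static uint64_t rotl64(uint64_t X, unsigned S) {
  S &= 63;
  return S ? (X << S) | (X >> (64 - S)) : X;
}

int main() {
  uint64_t X = 0x0123456789abcdefULL;
  for (unsigned S = 1; S < 64; ++S)
    assert(rotl64(X, S) == rotr64(X, 64 - S)); // rori with (XLen - shamt)
  return 0;
}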
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
+def : PatGprGpr<riscv_rolw, ROLW>;
+def : PatGprGpr<riscv_rorw, RORW>;
+def : PatGprImm<riscv_rorw, RORIW, uimm5>;
+def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
+ (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64]
+
let Predicates = [HasStdExtZbs] in {
def : Pat<(and (not (shiftop<shl> 1, GPR:$rs2)), GPR:$rs1),
(BCLR GPR:$rs1, GPR:$rs2)>;
@@ -852,48 +845,62 @@ def : Pat<(and GPR:$r, BCLRIANDIMask:$i),
(BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>;
}
-// There's no encoding for roli in the the 'B' extension as it can be
-// implemented with rori by negating the immediate.
-let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
-def : PatGprImm<rotr, RORI, uimmlog2xlen>;
-def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt),
- (RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
-
+let Predicates = [HasStdExtZbbOrZbp] in {
// We treat orc.b as a separate instruction, so match it directly. We also
// lower the Zbb orc.b intrinsic to this.
-def : Pat<(riscv_gorc GPR:$rs1, 7), (ORCB GPR:$rs1)>;
+def : Pat<(riscv_gorc GPR:$rs1, 7), (ORC_B GPR:$rs1)>;
+}
+
+let Predicates = [HasStdExtZbpOrZbkb] in {
+// We treat brev8 as a separate instruction, so match it directly. We also
+// use this for brev8 when lowering bitreverse with Zbkb.
+def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>;
+
+// We treat zip and unzip as separate instructions, so match them directly.
+def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>;
+def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>;
}
let Predicates = [HasStdExtZbp] in {
def : PatGprGpr<riscv_grev, GREV>;
def : PatGprGpr<riscv_gorc, GORC>;
+def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>;
+def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>;
+
def : PatGprGpr<riscv_shfl, SHFL>;
def : PatGprGpr<riscv_unshfl, UNSHFL>;
-def : PatGprGpr<int_riscv_xperm_n, XPERMN>;
-def : PatGprGpr<int_riscv_xperm_b, XPERMB>;
-def : PatGprGpr<int_riscv_xperm_h, XPERMH>;
def : PatGprImm<riscv_shfl, SHFLI, shfl_uimm>;
def : PatGprImm<riscv_unshfl, UNSHFLI, shfl_uimm>;
-def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>;
-def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>;
-// We treat brev8 as a separate instruction, so match it directly.
-def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>;
+def : PatGprGpr<int_riscv_xperm_n, XPERM4>;
+def : PatGprGpr<int_riscv_xperm_b, XPERM8>;
+def : PatGprGpr<int_riscv_xperm_h, XPERM_H>;
} // Predicates = [HasStdExtZbp]
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : PatGprGpr<riscv_grevw, GREVW>;
+def : PatGprGpr<riscv_gorcw, GORCW>;
+def : PatGprImm<riscv_grevw, GREVIW, uimm5>;
+def : PatGprImm<riscv_gorcw, GORCIW, uimm5>;
+
+// FIXME: Move to DAG combine.
+def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
+def : Pat<(riscv_rolw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
+
+def : PatGprGpr<riscv_shflw, SHFLW>;
+def : PatGprGpr<riscv_unshflw, UNSHFLW>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
let Predicates = [HasStdExtZbp, IsRV64] in
-def : PatGprGpr<int_riscv_xperm_w, XPERMW>;
+def : PatGprGpr<int_riscv_xperm_w, XPERM_W>;
let Predicates = [HasStdExtZbp, IsRV32] in {
+// FIXME: Move to DAG combine.
def : Pat<(i32 (rotr (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>;
def : Pat<(i32 (rotl (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>;
// We treat rev8 as a separate instruction, so match it directly.
def : Pat<(i32 (riscv_grev GPR:$rs1, 24)), (REV8_RV32 GPR:$rs1)>;
-
-// We treat zip and unzip as separate instructions, so match it directly.
-def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>;
-def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp, IsRV32]
let Predicates = [HasStdExtZbp, IsRV64] in {
@@ -942,15 +949,34 @@ def : Pat<(riscv_fsl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
(FSRI GPR:$rs1, GPR:$rs3, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
} // Predicates = [HasStdExtZbt]
+let Predicates = [HasStdExtZbt, IsRV64] in {
+def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2),
+ (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, GPR:$rs2),
+ (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, uimm5:$shamt),
+ (FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>;
+// We can use FSRIW for FSLW by immediate if we subtract the immediate from
+// 32 and swap the operands.
+def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt),
+ (FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>;
+} // Predicates = [HasStdExtZbt, IsRV64]
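(Editorial sketch, not part of the patch.) The FSLW-by-immediate pattern above leans on the generic funnel-shift identity fshl(a, b, s) == fshr(a, b, W - s) for s in 1..W-1; the ISel pattern additionally swaps the rs1/rs3 operands to match FSRIW's operand order. A minimal C++ check over 32-bit values using generic funnel-shift semantics rather than the exact instruction encoding; helper names are made up.

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t A, uint32_t B, unsigned S) {
  S &= 31;
  return S ? (A << S) | (B >> (32 - S)) : A;
}
static uint32_t fshr32(uint32_t A, uint32_t B, unsigned S) {
  S &= 31;
  return S ? (A << (32 - S)) | (B >> S) : B;
}

int main() {
  uint32_t A = 0x12345678, B = 0x9abcdef0;
  for (unsigned S = 1; S < 32; ++S)
    assert(fshl32(A, B, S) == fshr32(A, B, 32 - S));
  return 0;
}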
+
let Predicates = [HasStdExtZbb] in {
def : PatGpr<ctlz, CLZ>;
def : PatGpr<cttz, CTZ>;
def : PatGpr<ctpop, CPOP>;
} // Predicates = [HasStdExtZbb]
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def : PatGpr<riscv_clzw, CLZW>;
+def : PatGpr<riscv_ctzw, CTZW>;
+def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
let Predicates = [HasStdExtZbb] in {
-def : Pat<(sext_inreg GPR:$rs1, i8), (SEXTB GPR:$rs1)>;
-def : Pat<(sext_inreg GPR:$rs1, i16), (SEXTH GPR:$rs1)>;
+def : Pat<(sext_inreg GPR:$rs1, i8), (SEXT_B GPR:$rs1)>;
+def : Pat<(sext_inreg GPR:$rs1, i16), (SEXT_H GPR:$rs1)>;
}
let Predicates = [HasStdExtZbb] in {
@@ -968,35 +994,49 @@ let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in {
def : Pat<(i64 (bswap GPR:$rs1)), (REV8_RV64 GPR:$rs1)>;
} // Predicates = [HasStdExtZbbOrZbkb, IsRV64]
+let Predicates = [HasStdExtZbpOrZbkb] in {
+def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF),
+ (and GPR:$rs1, 0x00FF)),
+ (PACKH GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl (and GPR:$rs2, 0x00FF), (XLenVT 8)),
+ (and GPR:$rs1, 0x00FF)),
+ (PACKH GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbpOrZbkb]
+
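A short C check of why both PACKH patterns above describe the same value (illustrative only; a 64-bit XLEN is assumed for the masks and example operands):

#include <assert.h>
#include <stdint.h>

/* Both DAG shapes compute the PACKH result: rs1's low byte lands in bits
   7:0 and rs2's low byte in bits 15:8. */
int main(void) {
  uint64_t rs1 = 0x1122334455667788ull, rs2 = 0x99aabbccddeeff12ull;
  uint64_t a = ((rs2 << 8) & 0xFFFFull) | (rs1 & 0xFFull);   /* first pattern  */
  uint64_t b = ((rs2 & 0xFFull) << 8) | (rs1 & 0xFFull);     /* second pattern */
  assert(a == b && a == 0x1288ull);
  return 0;
}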
let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in
def : Pat<(i32 (or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in {
+def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))),
+ (PACK GPR:$rs1, GPR:$rs2)>;
+
+def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)),
+ (and GPR:$rs1, 0x000000000000FFFF)),
+ i32)),
+ (PACKW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
+ (and GPR:$rs1, 0x000000000000FFFF))),
+ (PACKW GPR:$rs1, GPR:$rs2)>;
+}
+
let Predicates = [HasStdExtZbp, IsRV32] in
def : Pat<(i32 (or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16)))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in
-def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))),
- (PACK GPR:$rs1, GPR:$rs2)>;
-
-let Predicates = [HasStdExtZbp, IsRV64] in
+let Predicates = [HasStdExtZbp, IsRV64] in {
def : Pat<(i64 (or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32)))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbpOrZbkb] in {
-def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF),
- (and GPR:$rs1, 0x00FF)),
- (PACKH GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (and GPR:$rs2, 0x00FF), (XLenVT 8)),
- (and GPR:$rs1, 0x00FF)),
- (PACKH GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbpOrZbkb]
+def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
+ (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))),
+ (PACKUW GPR:$rs1, GPR:$rs2)>;
+}
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
-def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV32 GPR:$rs)>;
+def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXT_H_RV32 GPR:$rs)>;
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
-def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV64 GPR:$rs)>;
+def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXT_H_RV64 GPR:$rs)>;
// Pattern to exclude simm12 immediates from matching.
def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{
@@ -1074,80 +1114,26 @@ def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)),
let Predicates = [HasStdExtZba, IsRV64] in {
def : Pat<(i64 (shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt)),
- (SLLIUW GPR:$rs1, uimm5:$shamt)>;
+ (SLLI_UW GPR:$rs1, uimm5:$shamt)>;
def : Pat<(i64 (add (and GPR:$rs1, 0xFFFFFFFF), non_imm12:$rs2)),
- (ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i64 (and GPR:$rs, 0xFFFFFFFF)), (ADDUW GPR:$rs, X0)>;
+ (ADD_UW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i64 (and GPR:$rs, 0xFFFFFFFF)), (ADD_UW GPR:$rs, X0)>;
def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 1)), non_imm12:$rs2)),
- (SH1ADDUW GPR:$rs1, GPR:$rs2)>;
+ (SH1ADD_UW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 2)), non_imm12:$rs2)),
- (SH2ADDUW GPR:$rs1, GPR:$rs2)>;
+ (SH2ADD_UW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 3)), non_imm12:$rs2)),
- (SH3ADDUW GPR:$rs1, GPR:$rs2)>;
+ (SH3ADD_UW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 1)), 0x1FFFFFFFF), non_imm12:$rs2)),
- (SH1ADDUW GPR:$rs1, GPR:$rs2)>;
+ (SH1ADD_UW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 2)), 0x3FFFFFFFF), non_imm12:$rs2)),
- (SH2ADDUW GPR:$rs1, GPR:$rs2)>;
+ (SH2ADD_UW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), non_imm12:$rs2)),
- (SH3ADDUW GPR:$rs1, GPR:$rs2)>;
+ (SH3ADD_UW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZba, IsRV64]
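The last three SH*ADD_UW patterns above handle the form where the zero-extension mask has already been folded across the shift; a small C check of the underlying identity (a sketch, shown for the shift-by-1 case):

#include <assert.h>
#include <stdint.h>

/* ((x & 0xFFFFFFFF) << 1) == ((x << 1) & 0x1FFFFFFFF): masking the low
   32 bits before shifting is the same as shifting first and widening the
   mask by the shift amount, so both DAG shapes select SH1ADD_UW. */
int main(void) {
  uint64_t x = 0xFEDCBA9876543210ull;
  assert(((x & 0xFFFFFFFFull) << 1) == ((x << 1) & 0x1FFFFFFFFull));
  return 0;
}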
-let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
-def : PatGprGpr<riscv_rolw, ROLW>;
-def : PatGprGpr<riscv_rorw, RORW>;
-def : PatGprImm<riscv_rorw, RORIW, uimm5>;
-def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
- (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
-} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
-def : Pat<(riscv_rolw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
-def : PatGprGpr<riscv_grevw, GREVW>;
-def : PatGprGpr<riscv_gorcw, GORCW>;
-def : PatGprGpr<riscv_shflw, SHFLW>;
-def : PatGprGpr<riscv_unshflw, UNSHFLW>;
-def : PatGprImm<riscv_grevw, GREVIW, uimm5>;
-def : PatGprImm<riscv_gorcw, GORCIW, uimm5>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
-let Predicates = [HasStdExtZbt, IsRV64] in {
-def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2),
- (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, GPR:$rs2),
- (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, uimm5:$shamt),
- (FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>;
-// We can use FSRIW for FSLW by immediate if we subtract the immediate from
-// 32 and swap the operands.
-def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt),
- (FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>;
-} // Predicates = [HasStdExtZbt, IsRV64]
-
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def : PatGpr<riscv_clzw, CLZW>;
-def : PatGpr<riscv_ctzw, CTZW>;
-def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
-} // Predicates = [HasStdExtZbb, IsRV64]
-
-let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in {
-def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)),
- (and GPR:$rs1, 0x000000000000FFFF)),
- i32)),
- (PACKW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
- (and GPR:$rs1, 0x000000000000FFFF))),
- (PACKW GPR:$rs1, GPR:$rs2)>;
-}
-
-let Predicates = [HasStdExtZbp, IsRV64] in
-def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
- (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))),
- (PACKUW GPR:$rs1, GPR:$rs2)>;
-
-
let Predicates = [HasStdExtZbcOrZbkc] in {
def : PatGprGpr<int_riscv_clmul, CLMUL>;
def : PatGprGpr<int_riscv_clmulh, CLMULH>;
@@ -1167,17 +1153,17 @@ def : PatGprGpr<riscv_bdecompressw, BDECOMPRESSW>;
} // Predicates = [HasStdExtZbe, IsRV64]
let Predicates = [HasStdExtZbr] in {
-def : PatGpr<int_riscv_crc32_b, CRC32B>;
-def : PatGpr<int_riscv_crc32_h, CRC32H>;
-def : PatGpr<int_riscv_crc32_w, CRC32W>;
-def : PatGpr<int_riscv_crc32c_b, CRC32CB>;
-def : PatGpr<int_riscv_crc32c_h, CRC32CH>;
-def : PatGpr<int_riscv_crc32c_w, CRC32CW>;
+def : PatGpr<int_riscv_crc32_b, CRC32_B>;
+def : PatGpr<int_riscv_crc32_h, CRC32_H>;
+def : PatGpr<int_riscv_crc32_w, CRC32_W>;
+def : PatGpr<int_riscv_crc32c_b, CRC32C_B>;
+def : PatGpr<int_riscv_crc32c_h, CRC32C_H>;
+def : PatGpr<int_riscv_crc32c_w, CRC32C_W>;
} // Predicates = [HasStdExtZbr]
let Predicates = [HasStdExtZbr, IsRV64] in {
-def : PatGpr<int_riscv_crc32_d, CRC32D>;
-def : PatGpr<int_riscv_crc32c_d, CRC32CD>;
+def : PatGpr<int_riscv_crc32_d, CRC32_D>;
+def : PatGpr<int_riscv_crc32c_d, CRC32C_D>;
} // Predicates = [HasStdExtZbr, IsRV64]
let Predicates = [HasStdExtZbf] in
@@ -1186,16 +1172,7 @@ def : PatGprGpr<riscv_bfp, BFP>;
let Predicates = [HasStdExtZbf, IsRV64] in
def : PatGprGpr<riscv_bfpw, BFPW>;
-let Predicates = [HasStdExtZbkb] in {
-def : PatGpr<int_riscv_brev8, BREV8>;
-} // Predicates = [HasStdExtZbkb]
-
-let Predicates = [HasStdExtZbkb, IsRV32] in {
-def : PatGpr<int_riscv_zip, ZIP_RV32>;
-def : PatGpr<int_riscv_unzip, UNZIP_RV32>;
-} // Predicates = [HasStdExtZbkb, IsRV32]
-
let Predicates = [HasStdExtZbkx] in {
-def : PatGprGpr<int_riscv_xperm4, XPERMN>;
-def : PatGprGpr<int_riscv_xperm8, XPERMB>;
+def : PatGprGpr<int_riscv_xperm4, XPERM4>;
+def : PatGprGpr<int_riscv_xperm8, XPERM8>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index dfd0c74ee26c..a2753c132354 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -29,14 +29,14 @@ def riscv_fmv_x_anyexth
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZfhmin] in {
+let Predicates = [HasStdExtZfhOrZfhmin] in {
def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
// encoding.
def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>;
-} // Predicates = [HasStdExtZfhmin]
+} // Predicates = [HasStdExtZfhOrZfhmin]
let Predicates = [HasStdExtZfh] in {
let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in {
@@ -98,7 +98,7 @@ def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, 0b00001, FPR16, GPR, "fcvt.h.wu">,
def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>;
} // Predicates = [HasStdExtZfh]
-let Predicates = [HasStdExtZfhmin] in {
+let Predicates = [HasStdExtZfhOrZfhmin] in {
def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, 0b00000, FPR16, FPR32, "fcvt.h.s">,
Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>;
def : FPUnaryOpDynFrmAlias<FCVT_H_S, "fcvt.h.s", FPR16, FPR32>;
@@ -113,7 +113,7 @@ def FMV_X_H : FPUnaryOp_r<0b1110010, 0b00000, 0b000, GPR, FPR16, "fmv.x.h">,
let mayRaiseFPException = 0 in
def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">,
Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]>;
-} // Predicates = [HasStdExtZfhmin]
+} // Predicates = [HasStdExtZfhOrZfhmin]
let Predicates = [HasStdExtZfh] in {
@@ -146,23 +146,23 @@ def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, 0b00011, FPR16, GPR, "fcvt.h.lu">,
def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>;
} // Predicates = [HasStdExtZfh, IsRV64]
-let Predicates = [HasStdExtZfhmin, HasStdExtD] in {
+let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in {
def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, 0b00001, FPR16, FPR64, "fcvt.h.d">,
Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>;
def : FPUnaryOpDynFrmAlias<FCVT_H_D, "fcvt.h.d", FPR16, FPR64>;
def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b00010, 0b000, FPR64, FPR16, "fcvt.d.h">,
Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>;
-} // Predicates = [HasStdExtZfhmin, HasStdExtD]
+} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD]
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20)
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZfhmin] in {
+let Predicates = [HasStdExtZfhOrZfhmin] in {
def : InstAlias<"flh $rd, (${rs1})", (FLH FPR16:$rd, GPR:$rs1, 0), 0>;
def : InstAlias<"fsh $rs2, (${rs1})", (FSH FPR16:$rs2, GPR:$rs1, 0), 0>;
-} // Predicates = [HasStdExtZfhmin]
+} // Predicates = [HasStdExtZfhOrZfhmin]
let Predicates = [HasStdExtZfh] in {
def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
@@ -177,14 +177,14 @@ def : InstAlias<"fge.h $rd, $rs, $rt",
(FLE_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>;
} // Predicates = [HasStdExtZfh]
-let Predicates = [HasStdExtZfhmin] in {
+let Predicates = [HasStdExtZfhOrZfhmin] in {
def PseudoFLH : PseudoFloatLoad<"flh", FPR16>;
def PseudoFSH : PseudoStore<"fsh", FPR16>;
let usesCustomInserter = 1 in {
def PseudoQuietFLE_H : PseudoQuietFCMP<FPR16>;
def PseudoQuietFLT_H : PseudoQuietFCMP<FPR16>;
}
-} // Predicates = [HasStdExtZfhmin]
+} // Predicates = [HasStdExtZfhOrZfhmin]
//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
@@ -281,7 +281,7 @@ def : PatSetCC<FPR16, any_fsetccs, SETOLE, FLE_H>;
def Select_FPR16_Using_CC_GPR : SelectCC_rrirr<FPR16, GPR>;
} // Predicates = [HasStdExtZfh]
-let Predicates = [HasStdExtZfhmin] in {
+let Predicates = [HasStdExtZfhOrZfhmin] in {
/// Loads
defm : LdPat<load, FLH, f16>;
@@ -299,7 +299,7 @@ def : Pat<(any_fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>;
// Moves (no conversion)
def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X GPR:$src)>;
def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>;
-} // Predicates = [HasStdExtZfhmin]
+} // Predicates = [HasStdExtZfhOrZfhmin]
let Predicates = [HasStdExtZfh, IsRV32] in {
// half->[u]int. Round-to-zero must be used.
@@ -351,7 +351,7 @@ def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>;
def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>;
} // Predicates = [HasStdExtZfh, IsRV64]
-let Predicates = [HasStdExtZfhmin, HasStdExtD] in {
+let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in {
/// Float conversion operations
// f64 -> f16, f16 -> f64
def : Pat<(any_fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>;
@@ -361,4 +361,4 @@ def : Pat<(any_fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>;
def : Pat<(fcopysign FPR16:$rs1, FPR64:$rs2),
(FSGNJ_H $rs1, (FCVT_H_D $rs2, 0b111))>;
def : Pat<(fcopysign FPR64:$rs1, FPR16:$rs2), (FSGNJ_D $rs1, (FCVT_D_H $rs2))>;
-} // Predicates = [HasStdExtZfhmin, HasStdExtD]
+} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td
index 4a41cddedc71..e4e07f4789a6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td
@@ -1,4 +1,4 @@
-//===- RISCVInstrInfoZk.td - RISC-V Scalar Crypto instructions - tablegen -*===//
+//===- RISCVInstrInfoZk.td - RISC-V 'Zk' instructions ------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
index 12ec52925798..715d92b036e3 100644
--- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -99,9 +99,9 @@ static bool isSignExtendingOpW(const MachineInstr &MI) {
case RISCV::SLTI:
case RISCV::SLTU:
case RISCV::SLTIU:
- case RISCV::SEXTB:
- case RISCV::SEXTH:
- case RISCV::ZEXTH_RV64:
+ case RISCV::SEXT_B:
+ case RISCV::SEXT_H:
+ case RISCV::ZEXT_H_RV64:
return true;
// shifting right sufficiently makes the value 32-bit sign-extended
case RISCV::SRAI:
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 044dda0a1ccc..34c6e8e684ac 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -195,6 +195,7 @@ public:
return 0;
}
+ unsigned getMinVLen() const { return ZvlLen; }
RISCVABI::ABI getTargetABI() const { return TargetABI; }
bool isRegisterReservedByUser(Register i) const {
assert(i < RISCV::NUM_TARGET_REGS && "Register out of range");
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index e950f9582f09..4d69040a4508 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -8,6 +8,7 @@
#include "MCTargetDesc/SparcFixupKinds.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
@@ -131,6 +132,23 @@ namespace {
return Sparc::NumTargetFixupKinds;
}
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override {
+ unsigned Type;
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/Sparc.def"
+#undef ELF_RELOC
+ .Case("BFD_RELOC_NONE", ELF::R_SPARC_NONE)
+ .Case("BFD_RELOC_8", ELF::R_SPARC_8)
+ .Case("BFD_RELOC_16", ELF::R_SPARC_16)
+ .Case("BFD_RELOC_32", ELF::R_SPARC_32)
+ .Case("BFD_RELOC_64", ELF::R_SPARC_64)
+ .Default(-1u);
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
+ }
+
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo InfosBE[Sparc::NumTargetFixupKinds] = {
// name offset bits flags
@@ -216,6 +234,11 @@ namespace {
{ "fixup_sparc_tls_le_lox10", 0, 0, 0 }
};
+      // Fixup kinds from the .reloc directive are like R_SPARC_NONE. They do
+      // not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -229,6 +252,8 @@ namespace {
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override {
+ if (Fixup.getKind() >= FirstLiteralRelocationKind)
+ return true;
switch ((Sparc::Fixups)Fixup.getKind()) {
default:
return false;
@@ -299,6 +324,8 @@ namespace {
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override {
+ if (Fixup.getKind() >= FirstLiteralRelocationKind)
+ return;
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index bc508b45c3bd..02261dc5c4cd 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -42,6 +42,9 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) {
if (SExpr->getKind() == SparcMCExpr::VK_Sparc_R_DISP32)
@@ -68,6 +71,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
switch(Fixup.getTargetKind()) {
default:
llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_NONE: return ELF::R_SPARC_NONE;
case FK_Data_1: return ELF::R_SPARC_8;
case FK_Data_2: return ((Fixup.getOffset() % 2)
? ELF::R_SPARC_UA16
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index ccc7d0737f53..610627e7e3f0 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -80,6 +80,88 @@ MachineBasicBlock::iterator SystemZFrameLowering::eliminateCallFramePseudoInstr(
}
}
+namespace {
+struct SZFrameSortingObj {
+ bool IsValid = false; // True if we care about this Object.
+ uint32_t ObjectIndex = 0; // Index of Object into MFI list.
+ uint64_t ObjectSize = 0; // Size of Object in bytes.
+ uint32_t D12Count = 0; // 12-bit displacement only.
+ uint32_t DPairCount = 0; // 12 or 20 bit displacement.
+};
+typedef std::vector<SZFrameSortingObj> SZFrameObjVec;
+} // namespace
+
+// TODO: Move to base class.
+void SystemZELFFrameLowering::orderFrameObjects(
+ const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ // Make a vector of sorting objects to track all MFI objects and mark those
+ // to be sorted as valid.
+ if (ObjectsToAllocate.size() <= 1)
+ return;
+ SZFrameObjVec SortingObjects(MFI.getObjectIndexEnd());
+ for (auto &Obj : ObjectsToAllocate) {
+ SortingObjects[Obj].IsValid = true;
+ SortingObjects[Obj].ObjectIndex = Obj;
+ SortingObjects[Obj].ObjectSize = MFI.getObjectSize(Obj);
+ }
+
+ // Examine uses for each object and record short (12-bit) and "pair"
+ // displacement types.
+ for (auto &MBB : MF)
+ for (auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isFI())
+ continue;
+ int Index = MO.getIndex();
+ if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
+ SortingObjects[Index].IsValid) {
+ if (TII->hasDisplacementPairInsn(MI.getOpcode()))
+ SortingObjects[Index].DPairCount++;
+ else if (!(MI.getDesc().TSFlags & SystemZII::Has20BitOffset))
+ SortingObjects[Index].D12Count++;
+ }
+ }
+ }
+
+ // Sort all objects for short/paired displacements, which should be
+ // sufficient as it seems like all frame objects typically are within the
+ // long displacement range. Sorting works by computing the "density" as
+ // Count / ObjectSize. The comparisons of two such fractions are refactored
+  // by multiplying both sides by A.ObjectSize * B.ObjectSize, in order to
+  // eliminate the (fp) divisions. A higher-density object needs to come later
+  // in the list so that it ends up lower on the stack.
+ auto CmpD12 = [](const SZFrameSortingObj &A, const SZFrameSortingObj &B) {
+ // Put all invalid and variable sized objects at the end.
+ if (!A.IsValid || !B.IsValid)
+ return A.IsValid;
+ if (!A.ObjectSize || !B.ObjectSize)
+ return A.ObjectSize > 0;
+ uint64_t ADensityCmp = A.D12Count * B.ObjectSize;
+ uint64_t BDensityCmp = B.D12Count * A.ObjectSize;
+ if (ADensityCmp != BDensityCmp)
+ return ADensityCmp < BDensityCmp;
+ return A.DPairCount * B.ObjectSize < B.DPairCount * A.ObjectSize;
+ };
+ std::stable_sort(SortingObjects.begin(), SortingObjects.end(), CmpD12);
+
+ // Now modify the original list to represent the final order that
+ // we want.
+ unsigned Idx = 0;
+ for (auto &Obj : SortingObjects) {
+ // All invalid items are sorted at the end, so it's safe to stop.
+ if (!Obj.IsValid)
+ break;
+ ObjectsToAllocate[Idx++] = Obj.ObjectIndex;
+ }
+}
+
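As a worked illustration of the cross-multiplied density comparison in CmpD12 above (the object sizes and use counts are hypothetical, chosen only to show the arithmetic):

#include <assert.h>
#include <stdint.h>

/* Object A: 3 short-displacement uses over 8 bytes (density 0.375).
   Object B: 2 short-displacement uses over 4 bytes (density 0.5).
   Comparing A.Count * B.Size against B.Count * A.Size avoids the FP
   division and orders A before B, so the denser B ends up later in the
   list and therefore lower on the stack. */
int main(void) {
  uint64_t ACount = 3, ASize = 8, BCount = 2, BSize = 4;
  assert(ACount * BSize < BCount * ASize); /* 12 < 16, so A sorts first */
  return 0;
}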
bool SystemZFrameLowering::hasReservedCallFrame(
const MachineFunction &MF) const {
// The ELF ABI requires us to allocate 160 bytes of stack space for the
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 3a1af888d8f9..2b3d7efed53b 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -77,6 +77,9 @@ public:
bool hasFP(const MachineFunction &MF) const override;
StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const override;
+ void
+ orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
// Return the byte offset from the incoming stack pointer of Reg's
// ABI-defined save slot. Return 0 if no slot is defined for Reg. Adjust
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index a8ddb8c62d18..de446f33f5f1 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -443,6 +443,11 @@ public:
EVT VT) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
+ bool ShouldShrinkFPConstant(EVT VT) const override {
+ // Do not shrink 64-bit FP constpool entries since LDEB is slower than
+ // LD, and having the full constant in memory enables reg/mem opcodes.
+ return VT != MVT::f64;
+ }
bool hasInlineStackProbe(MachineFunction &MF) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
bool isLegalAddImmediate(int64_t Imm) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 6db9bf3056b7..4b6aa60f5d55 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1652,6 +1652,13 @@ unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode,
return 0;
}
+bool SystemZInstrInfo::hasDisplacementPairInsn(unsigned Opcode) const {
+ const MCInstrDesc &MCID = get(Opcode);
+ if (MCID.TSFlags & SystemZII::Has20BitOffset)
+ return SystemZ::getDisp12Opcode(Opcode) >= 0;
+ return SystemZ::getDisp20Opcode(Opcode) >= 0;
+}
+
unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const {
switch (Opcode) {
case SystemZ::L: return SystemZ::LT;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 396f56c7f59c..9e5b2729a707 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -312,6 +312,9 @@ public:
// exists.
unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const;
+  // Return true if Opcode has a mapping between 12-bit and 20-bit displacements.
+ bool hasDisplacementPairInsn(unsigned Opcode) const;
+
// If Opcode is a load instruction that has a LOAD AND TEST form,
// return the opcode for the testing form, otherwise return 0.
unsigned getLoadAndTest(unsigned Opcode) const;
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 0412e524f800..0f1655718481 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -167,3 +167,41 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) {
llvm_unreachable("unexpected type");
}
}
+
+void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
+ const SmallVector<MVT, 1> &VTs) {
+ assert(!Sym->getType());
+
+  // Tables are represented as Arrays in LLVM IR, so they reach this point
+  // as aggregate Array types with an element type that is a reference type.
+ wasm::ValType Type;
+ bool IsTable = false;
+ if (GlobalVT->isArrayTy() &&
+ WebAssembly::isRefType(GlobalVT->getArrayElementType())) {
+ MVT VT;
+ IsTable = true;
+ switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) {
+ case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF:
+ VT = MVT::funcref;
+ break;
+ case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF:
+ VT = MVT::externref;
+ break;
+ default:
+ report_fatal_error("unhandled address space type");
+ }
+ Type = WebAssembly::toValType(VT);
+ } else if (VTs.size() == 1) {
+ Type = WebAssembly::toValType(VTs[0]);
+ } else
+ report_fatal_error("Aggregate globals not yet implemented");
+
+ if (IsTable) {
+ Sym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
+ Sym->setTableType(Type);
+ } else {
+ Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true});
+ }
+}
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
index 042d51c7d6cb..cdb95d48398d 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
@@ -17,6 +17,8 @@
#include "llvm/ADT/Optional.h"
#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -41,6 +43,43 @@ enum class BlockType : unsigned {
Multivalue = 0xffff,
};
+enum WasmAddressSpace : unsigned {
+ // Default address space, for pointers to linear memory (stack, heap, data).
+ WASM_ADDRESS_SPACE_DEFAULT = 0,
+ // A non-integral address space for pointers to named objects outside of
+ // linear memory: WebAssembly globals or WebAssembly locals. Loads and stores
+ // to these pointers are lowered to global.get / global.set or local.get /
+ // local.set, as appropriate.
+ WASM_ADDRESS_SPACE_VAR = 1,
+ // A non-integral address space for externref values
+ WASM_ADDRESS_SPACE_EXTERNREF = 10,
+ // A non-integral address space for funcref values
+ WASM_ADDRESS_SPACE_FUNCREF = 20,
+};
+
+inline bool isDefaultAddressSpace(unsigned AS) {
+ return AS == WASM_ADDRESS_SPACE_DEFAULT;
+}
+inline bool isWasmVarAddressSpace(unsigned AS) {
+ return AS == WASM_ADDRESS_SPACE_VAR;
+}
+inline bool isValidAddressSpace(unsigned AS) {
+ return isDefaultAddressSpace(AS) || isWasmVarAddressSpace(AS);
+}
+inline bool isFuncrefType(const Type *Ty) {
+ return isa<PointerType>(Ty) &&
+ Ty->getPointerAddressSpace() ==
+ WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF;
+}
+inline bool isExternrefType(const Type *Ty) {
+ return isa<PointerType>(Ty) &&
+ Ty->getPointerAddressSpace() ==
+ WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF;
+}
+inline bool isRefType(const Type *Ty) {
+ return isFuncrefType(Ty) || isExternrefType(Ty);
+}
+
// Convert StringRef to ValType / HeapType / BlockType
Optional<wasm::ValType> parseType(StringRef Type);
@@ -68,6 +107,10 @@ wasm::ValType toValType(MVT Type);
// Convert a register class to a wasm ValType.
wasm::ValType regClassToValType(unsigned RC);
+/// Sets the type of a Wasm symbol.
+void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
+ const SmallVector<MVT, 1> &VTs);
+
} // end namespace WebAssembly
} // end namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
index 57e40f6cd8d7..cdfc758db7ac 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
@@ -15,7 +15,6 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
namespace llvm {
@@ -30,43 +29,6 @@ class WebAssemblySubtarget;
namespace WebAssembly {
-enum WasmAddressSpace : unsigned {
- // Default address space, for pointers to linear memory (stack, heap, data).
- WASM_ADDRESS_SPACE_DEFAULT = 0,
- // A non-integral address space for pointers to named objects outside of
- // linear memory: WebAssembly globals or WebAssembly locals. Loads and stores
- // to these pointers are lowered to global.get / global.set or local.get /
- // local.set, as appropriate.
- WASM_ADDRESS_SPACE_VAR = 1,
- // A non-integral address space for externref values
- WASM_ADDRESS_SPACE_EXTERNREF = 10,
- // A non-integral address space for funcref values
- WASM_ADDRESS_SPACE_FUNCREF = 20,
-};
-
-inline bool isDefaultAddressSpace(unsigned AS) {
- return AS == WASM_ADDRESS_SPACE_DEFAULT;
-}
-inline bool isWasmVarAddressSpace(unsigned AS) {
- return AS == WASM_ADDRESS_SPACE_VAR;
-}
-inline bool isValidAddressSpace(unsigned AS) {
- return isDefaultAddressSpace(AS) || isWasmVarAddressSpace(AS);
-}
-inline bool isFuncrefType(const Type *Ty) {
- return isa<PointerType>(Ty) &&
- Ty->getPointerAddressSpace() ==
- WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF;
-}
-inline bool isExternrefType(const Type *Ty) {
- return isa<PointerType>(Ty) &&
- Ty->getPointerAddressSpace() ==
- WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF;
-}
-inline bool isRefType(const Type *Ty) {
- return isFuncrefType(Ty) || isExternrefType(Ty);
-}
-
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
bool mayThrow(const MachineInstr &MI);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index e3af6b2662ef..bf326e5106be 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -181,17 +181,11 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (!Sym->getType()) {
const WebAssemblyTargetLowering &TLI = *Subtarget->getTargetLowering();
- SmallVector<EVT, 1> VTs;
- ComputeValueVTs(TLI, GV->getParent()->getDataLayout(), GV->getValueType(),
- VTs);
- if (VTs.size() != 1 ||
- TLI.getNumRegisters(GV->getParent()->getContext(), VTs[0]) != 1)
- report_fatal_error("Aggregate globals not yet implemented");
- MVT VT = TLI.getRegisterType(GV->getParent()->getContext(), VTs[0]);
- bool Mutable = true;
- wasm::ValType Type = WebAssembly::toValType(VT);
- Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
- Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable});
+ SmallVector<MVT, 1> VTs;
+ Type *GlobalVT = GV->getValueType();
+ computeLegalValueVTs(TLI, GV->getParent()->getContext(),
+ GV->getParent()->getDataLayout(), GlobalVT, VTs);
+ WebAssembly::wasmSymbolSetType(Sym, GlobalVT, VTs);
}
// If the GlobalVariable refers to a table, we handle it here instead of
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 406edef8ff3f..8ddd414b043a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -16,6 +16,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index c45f7d7176b5..01baa3d9389d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -19,7 +19,7 @@
#include "WebAssemblyFrameLowering.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
-#include "Utils/WebAssemblyUtilities.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyInstrInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index fe656753889f..b6c43be03aba 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -560,6 +560,9 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
NEltArg = NEltArg.getValue() + 1;
FnAttrs.addAllocSizeAttr(SizeArg, NEltArg);
}
+  // If the callee has the 'noreturn' attribute, we need to remove it, because
+  // we expect invoke wrappers to return.
+ FnAttrs.removeAttribute(Attribute::NoReturn);
// Reconstruct the AttributesList based on the vector we constructed.
AttributeList NewCallAL = AttributeList::get(
@@ -630,9 +633,9 @@ static bool canLongjmp(const Value *Callee) {
// Exception-catching related functions
//
- // We intentionally excluded __cxa_end_catch here even though it surely cannot
- // longjmp, in order to maintain the unwind relationship from all existing
- // catchpads (and calls within them) to catch.dispatch.longjmp.
+  // We intentionally treat __cxa_end_catch as longjmpable in Wasm SjLj even
+  // though it surely cannot longjmp, in order to maintain the unwind
+  // relationship from all existing catchpads (and calls within them) to
+  // catch.dispatch.longjmp.
//
// In Wasm EH + Wasm SjLj, we
// 1. Make all catchswitch and cleanuppad that unwind to caller unwind to
@@ -663,6 +666,8 @@ static bool canLongjmp(const Value *Callee) {
//
// The comment block in findWasmUnwindDestinations() in
// SelectionDAGBuilder.cpp is addressing a similar problem.
+ if (CalleeName == "__cxa_end_catch")
+ return WebAssembly::WasmEnableSjLj;
if (CalleeName == "__cxa_begin_catch" ||
CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" ||
CalleeName == "__clang_call_terminate")
@@ -869,15 +874,17 @@ static void nullifySetjmp(Function *F) {
Function *SetjmpF = M.getFunction("setjmp");
SmallVector<Instruction *, 1> ToErase;
- for (User *U : SetjmpF->users()) {
- auto *CI = dyn_cast<CallInst>(U);
- // FIXME 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj, but
- // we don't support two being used together yet.
- if (!CI)
- report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet");
- BasicBlock *BB = CI->getParent();
+ for (User *U : make_early_inc_range(SetjmpF->users())) {
+ auto *CB = cast<CallBase>(U);
+ BasicBlock *BB = CB->getParent();
if (BB->getParent() != F) // in other function
continue;
+ CallInst *CI = nullptr;
+ // setjmp cannot throw. So if it is an invoke, lower it to a call
+ if (auto *II = dyn_cast<InvokeInst>(CB))
+ CI = llvm::changeToCall(II);
+ else
+ CI = cast<CallInst>(CB);
ToErase.push_back(CI);
CI->replaceAllUsesWith(IRB.getInt32(0));
}
@@ -1313,10 +1320,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
SmallVector<PHINode *, 4> SetjmpRetPHIs;
Function *SetjmpF = M.getFunction("setjmp");
for (auto *U : make_early_inc_range(SetjmpF->users())) {
- auto *CB = dyn_cast<CallBase>(U);
+ auto *CB = cast<CallBase>(U);
BasicBlock *BB = CB->getParent();
if (BB->getParent() != &F) // in other function
continue;
+ if (CB->getOperandBundle(LLVMContext::OB_funclet))
+ report_fatal_error(
+ "setjmp within a catch clause is not supported in Wasm EH");
CallInst *CI = nullptr;
// setjmp cannot throw. So if it is an invoke, lower it to a call
@@ -1815,10 +1825,10 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
BasicBlock *UnwindDest = nullptr;
if (auto Bundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
Instruction *FromPad = cast<Instruction>(Bundle->Inputs[0]);
- while (!UnwindDest && FromPad) {
+ while (!UnwindDest) {
if (auto *CPI = dyn_cast<CatchPadInst>(FromPad)) {
UnwindDest = CPI->getCatchSwitch()->getUnwindDest();
- FromPad = nullptr; // stop searching
+ break;
} else if (auto *CPI = dyn_cast<CleanupPadInst>(FromPad)) {
// getCleanupRetUnwindDest() can return nullptr when
          // 1. This cleanuppad's matching cleanupret unwinds to caller
@@ -1826,7 +1836,10 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
// unreachable.
// In case of 2, we need to traverse the parent pad chain.
UnwindDest = getCleanupRetUnwindDest(CPI);
- FromPad = cast<Instruction>(CPI->getParentPad());
+ Value *ParentPad = CPI->getParentPad();
+ if (isa<ConstantTokenNone>(ParentPad))
+ break;
+ FromPad = cast<Instruction>(ParentPad);
}
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
index 8ff916c28c4e..6fd87f10150d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
@@ -14,7 +14,7 @@
///
//===----------------------------------------------------------------------===//
-#include "Utils/WebAssemblyUtilities.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblySubtarget.h"
#include "llvm/IR/InstIterator.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 09bccef17ab0..2e6027a5605c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -59,39 +59,7 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
SmallVector<MVT, 1> VTs;
computeLegalValueVTs(CurrentFunc, TM, GlobalVT, VTs);
- // Tables are represented as Arrays in LLVM IR therefore
- // they reach this point as aggregate Array types with an element type
- // that is a reference type.
- wasm::ValType Type;
- bool IsTable = false;
- if (GlobalVT->isArrayTy() &&
- WebAssembly::isRefType(GlobalVT->getArrayElementType())) {
- MVT VT;
- IsTable = true;
- switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) {
- case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF:
- VT = MVT::funcref;
- break;
- case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF:
- VT = MVT::externref;
- break;
- default:
- report_fatal_error("unhandled address space type");
- }
- Type = WebAssembly::toValType(VT);
- } else if (VTs.size() == 1) {
- Type = WebAssembly::toValType(VTs[0]);
- } else
- report_fatal_error("Aggregate globals not yet implemented");
-
- if (IsTable) {
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
- WasmSym->setTableType(Type);
- } else {
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
- WasmSym->setGlobalType(
- wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true});
- }
+ WebAssembly::wasmSymbolSetType(WasmSym, GlobalVT, VTs);
}
return WasmSym;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 00b11321fdb2..ea80e96d50de 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -30,22 +30,28 @@ void WebAssemblyFunctionInfo::initWARegs(MachineRegisterInfo &MRI) {
WARegs.resize(MRI.getNumVirtRegs(), Reg);
}
-void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM,
+void llvm::computeLegalValueVTs(const WebAssemblyTargetLowering &TLI,
+ LLVMContext &Ctx, const DataLayout &DL,
Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
- const DataLayout &DL(F.getParent()->getDataLayout());
- const WebAssemblyTargetLowering &TLI =
- *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering();
SmallVector<EVT, 4> VTs;
ComputeValueVTs(TLI, DL, Ty, VTs);
for (EVT VT : VTs) {
- unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT);
- MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT);
+ unsigned NumRegs = TLI.getNumRegisters(Ctx, VT);
+ MVT RegisterVT = TLI.getRegisterType(Ctx, VT);
for (unsigned I = 0; I != NumRegs; ++I)
ValueVTs.push_back(RegisterVT);
}
}
+void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM,
+ Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
+ const DataLayout &DL(F.getParent()->getDataLayout());
+ const WebAssemblyTargetLowering &TLI =
+ *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering();
+ computeLegalValueVTs(TLI, F.getContext(), DL, Ty, ValueVTs);
+}
+
void llvm::computeSignatureVTs(const FunctionType *Ty,
const Function *TargetFunc,
const Function &ContextFunc,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 3fa2d0c8a2f2..413d0d1dc554 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -166,6 +166,10 @@ public:
void setWasmEHFuncInfo(WasmEHFuncInfo *Info) { WasmEHInfo = Info; }
};
+void computeLegalValueVTs(const WebAssemblyTargetLowering &TLI,
+ LLVMContext &Ctx, const DataLayout &DL, Type *Ty,
+ SmallVectorImpl<MVT> &ValueVTs);
+
void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
SmallVectorImpl<MVT> &ValueVTs);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aff72452af6c..90753b5b4d33 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -805,8 +805,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
- for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
- MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
+ MVT::v4f32, MVT::v8f32, MVT::v16f32,
+ MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
@@ -1094,13 +1095,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT == MVT::v2i64) continue;
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::FSHL, VT, Custom);
+ setOperationAction(ISD::FSHR, VT, Custom);
}
- setOperationAction(ISD::FSHL, MVT::v16i8, Custom);
- setOperationAction(ISD::FSHR, MVT::v16i8, Custom);
- setOperationAction(ISD::FSHL, MVT::v4i32, Custom);
- setOperationAction(ISD::FSHR, MVT::v4i32, Custom);
-
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
@@ -1958,6 +1956,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// AVX512_FP16 scalar operations
setGroup(MVT::f16);
addRegisterClass(MVT::f16, &X86::FR16XRegClass);
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
setOperationAction(ISD::SETCC, MVT::f16, Custom);
@@ -12571,6 +12571,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
+ unsigned NumElts = VT.getVectorNumElements();
+
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
@@ -12629,8 +12631,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
- MVT IntegerType =
- MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
@@ -12699,8 +12700,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
- MVT IntegerType =
- MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
@@ -29843,7 +29843,8 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
{Op0, Op1, Amt}, DAG, Subtarget);
}
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
- VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
+ VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+ VT == MVT::v16i32) &&
"Unexpected funnel shift type!");
  // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
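A one-lane C model of that widening trick (a sketch only; the real code applies it to whole vectors via the unpack nodes used below):

#include <assert.h>
#include <stdint.h>

/* For a single 32-bit lane, fshl(x, y, z) is the high half of the 64-bit
   value x:y shifted left by z & 31. */
static uint32_t fshl32(uint32_t x, uint32_t y, unsigned z) {
  z &= 31;
  return z ? (x << z) | (y >> (32 - z)) : x;
}
int main(void) {
  uint32_t x = 0xDEADBEEFu, y = 0x01234567u;
  for (unsigned z = 0; z < 32; ++z) {
    uint64_t wide = ((uint64_t)x << 32) | y;
    assert((uint32_t)((wide << (z & 31)) >> 32) == fshl32(x, y, z));
  }
  return 0;
}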
@@ -29855,6 +29856,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
+ // Constant vXi16 funnel shifts can be efficiently handled by default.
+ if (IsCst && EltSizeInBits == 16)
+ return SDValue();
+
unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
unsigned NumElts = VT.getVectorNumElements();
MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
@@ -29874,6 +29879,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
// Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) {
+ // Uniform vXi16 funnel shifts can be efficiently handled by default.
+ if (EltSizeInBits == 16)
+ return SDValue();
+
SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32);
@@ -29912,7 +29921,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
}
// Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
- if ((IsCst && !IsFSHR && EltSizeInBits == 8) ||
+ if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
SDValue Z = DAG.getConstant(0, DL, VT);
SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
@@ -36477,9 +36486,8 @@ static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
- SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget, unsigned &Shuffle,
- MVT &SrcVT, MVT &DstVT) {
+ SDValue V1, const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
@@ -36522,9 +36530,6 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
MVT::getIntegerVT(MaskEltSize);
SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
- if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
- V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
-
Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
if (SrcVT.getVectorNumElements() != NumDstElts)
Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
@@ -37102,6 +37107,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
+ SDLoc DL(Root);
MVT RootVT = Root.getSimpleValueType();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
@@ -37109,6 +37115,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Canonicalize shuffle input op to the requested type.
// TODO: Support cases where Op is smaller than VT.
auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
+ if (VT.getSizeInBits() < Op.getValueSizeInBits())
+ Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
return DAG.getBitcast(VT, Op);
};
@@ -37124,7 +37132,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
assert(VT1.getSizeInBits() == RootSizeInBits &&
VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
- SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
@@ -37393,15 +37400,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- SDValue NewV1 = V1; // Save operand in case early exit happens.
- if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
- DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
- ShuffleVT) &&
+ if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
+ Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
+ Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
@@ -40903,6 +40908,28 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Known.One.setHighBits(ShAmt);
return false;
}
+ case X86ISD::BLENDV: {
+ SDValue Sel = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+
+ APInt SignMask = APInt::getSignMask(BitWidth);
+ SDValue NewSel = SimplifyMultipleUseDemandedBits(
+ Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ SDValue NewLHS = SimplifyMultipleUseDemandedBits(
+ LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ SDValue NewRHS = SimplifyMultipleUseDemandedBits(
+ RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
+
+ if (NewSel || NewLHS || NewRHS) {
+ NewSel = NewSel ? NewSel : Sel;
+ NewLHS = NewLHS ? NewLHS : LHS;
+ NewRHS = NewRHS ? NewRHS : RHS;
+ return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
+ NewSel, NewLHS, NewRHS));
+ }
+ break;
+ }
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Vec = Op.getOperand(0);
@@ -41043,6 +41070,13 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+ // See if we only demand bits from the lower 128-bit vector.
+ if (SrcVT.is256BitVector() &&
+ OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
+ SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
+ }
+
// Only demand the vector elements of the sign bits we need.
APInt KnownUndef, KnownZero;
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
@@ -42238,19 +42272,14 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
- // For all_of(setcc(x,y,eq))
- // - avoid vXi64 comparisons without PCMPEQQ (SSE41+), use PCMPEQD.
- // - avoid vXi16 comparisons, use PMOVMSKB(PCMPEQB()).
+ // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
ISD::CondCode::SETEQ) {
- SDValue Vec = Match.getOperand(0);
- EVT VecSVT = Vec.getValueType().getScalarType();
- if ((VecSVT == MVT::i16 && !Subtarget.hasBWI()) ||
- (VecSVT == MVT::i64 && !Subtarget.hasSSE41())) {
- NumElts *= 2;
- VecSVT = VecSVT.getHalfSizedIntegerVT(*DAG.getContext());
- EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumElts);
+ EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
+ if (VecSVT != MVT::i8) {
+ NumElts *= VecSVT.getSizeInBits() / 8;
+ EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Match = DAG.getSetCC(
DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
@@ -43079,6 +43108,38 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
}
+ // If this extract is from a loaded vector value and will be used as an
+ // integer, that requires a potentially expensive XMM -> GPR transfer.
+ // Additionally, if we can convert to a scalar integer load, that will likely
+ // be folded into a subsequent integer op.
+  // Note: Unlike the related fold for this in DAGCombiner, this is not limited
+  // to a single use of the loaded vector. For the reasons above, we expect this
+  // to be profitable even if it creates an extra load.
+ bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
+ return Use->getOpcode() == ISD::STORE ||
+ Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
+ Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
+ });
+ auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
+ if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
+ SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
+ !LikelyUsedAsVector) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue NewPtr =
+ TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
+ unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
+ MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
+ Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
+ SDValue Load =
+ DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
+ LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
+ SDValue Chain = Load.getValue(1);
+ SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)};
+ SDValue To[] = {Load, Chain};
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ return SDValue(N, 0);
+ }
+
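A minimal C model of the equivalence this fold relies on (illustrative types only: a v4i32 in memory and lane 2; the real code also recomputes alignment via commonAlignment, as shown above):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Extracting lane CIdx of a vector that was loaded from memory gives the
   same value as a scalar load from base + CIdx * element-size, which is
   the rewrite performed above to avoid the XMM -> GPR transfer. */
int main(void) {
  uint32_t mem[4] = {10, 20, 30, 40};
  unsigned CIdx = 2;
  uint32_t scalar;
  memcpy(&scalar, (const char *)mem + CIdx * sizeof(uint32_t),
         sizeof(uint32_t));
  assert(scalar == mem[CIdx]);
  return 0;
}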
return SDValue();
}
@@ -44467,8 +44528,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
unsigned NumEltBits = VecVT.getScalarSizeInBits();
bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
- bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
- CmpVal.isMask(NumElts);
+ bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
+ NumElts <= CmpBits && CmpVal.isMask(NumElts);
if (!IsAnyOf && !IsAllOf)
return SDValue();
@@ -44500,14 +44561,16 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
// MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
// MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
// MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
- if (VecVT.is256BitVector()) {
+ if (VecVT.is256BitVector() && NumElts <= CmpBits) {
SmallVector<SDValue> Ops;
if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
Ops.size() == 2) {
SDLoc DL(EFLAGS);
- EVT SubVT = Ops[0].getValueType();
+ EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
- SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, Ops);
+ SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
+ DAG.getBitcast(SubVT, Ops[0]),
+ DAG.getBitcast(SubVT, Ops[1]));
V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
@@ -44522,26 +44585,29 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
if (IsAllOf && Subtarget.hasSSE41()) {
MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
SDValue BC = peekThroughBitcasts(Vec);
- if (BC.getOpcode() == X86ISD::PCMPEQ) {
- SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
- BC.getOperand(0), BC.getOperand(1));
- V = DAG.getBitcast(TestVT, V);
- return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
- }
- // Check for 256-bit split vector cases.
- if (BC.getOpcode() == ISD::AND &&
- BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
- BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
- SDValue LHS = BC.getOperand(0);
- SDValue RHS = BC.getOperand(1);
- LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
- LHS.getOperand(0), LHS.getOperand(1));
- RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
- RHS.getOperand(0), RHS.getOperand(1));
- LHS = DAG.getBitcast(TestVT, LHS);
- RHS = DAG.getBitcast(TestVT, RHS);
- SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
- return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ // Ensure MOVMSK was testing every signbit of BC.
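      // Illustration (hypothetical types): a v16i8 PCMPEQ result bitcast to
      // v4i32 for MOVMSKPS exposes only 4 of the 16 byte sign bits, so an
      // all-of match on those 4 bits is weaker than a PTEST over every byte
      // and must not be rewritten.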
+ if (BC.getValueType().getVectorNumElements() <= NumElts) {
+ if (BC.getOpcode() == X86ISD::PCMPEQ) {
+ SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
+ BC.getOperand(0), BC.getOperand(1));
+ V = DAG.getBitcast(TestVT, V);
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ // Check for 256-bit split vector cases.
+ if (BC.getOpcode() == ISD::AND &&
+ BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
+ BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
+ SDValue LHS = BC.getOperand(0);
+ SDValue RHS = BC.getOperand(1);
+ LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
+ LHS.getOperand(0), LHS.getOperand(1));
+ RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
+ RHS.getOperand(0), RHS.getOperand(1));
+ LHS = DAG.getBitcast(TestVT, LHS);
+ RHS = DAG.getBitcast(TestVT, RHS);
+ SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
}
}
@@ -44575,7 +44641,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
SDLoc DL(EFLAGS);
SDValue Result = peekThroughBitcasts(Src);
- if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ) {
+ if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
+ Result.getValueType().getVectorNumElements() <= NumElts) {
SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
Result.getOperand(0), Result.getOperand(1));
V = DAG.getBitcast(MVT::v4i64, V);
@@ -46840,14 +46907,18 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
return false;
+ APInt DemandedBits = APInt::getZero(EltSizeInBits);
APInt DemandedElts = APInt::getZero(NumElts);
for (int I = 0; I != NumElts; ++I)
- if (!EltBits[I].isZero())
+ if (!EltBits[I].isZero()) {
+ DemandedBits |= EltBits[I];
DemandedElts.setBit(I);
+ }
APInt KnownUndef, KnownZero;
return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
- KnownZero, DCI);
+ KnownZero, DCI) ||
+ TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI);
};
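  // Illustration (hypothetical constant): if this AND's constant operand is
  // <0, 0x00FF, 0, 0x00FF>, only elements 1 and 3 are demanded from the other
  // operand, and now only their low 8 bits as well, letting
  // SimplifyDemandedBits strip work that feeds just the high bits.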
if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
if (N->getOpcode() != ISD::DELETED_NODE)
@@ -49031,8 +49102,13 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
  // SSSE3's pshufb results in fewer instructions in the cases below.
- if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
- return SDValue();
+ if (Subtarget.hasSSSE3() && NumElems == 8) {
+ if (InSVT == MVT::i16)
+ return SDValue();
+ if (InSVT == MVT::i32 &&
+ (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
+ return SDValue();
+ }
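  // Illustration (assumed targets): an 8 x i32 -> 8 x i16 truncation on an
  // SSE4.1-but-not-AVX2 target now falls through to the PACK lowering below,
  // while i32 -> i8, pre-SSE4.1 i32 truncations, and all i16 sources still
  // return here and keep the PSHUFB lowering.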
SDLoc DL(N);
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
@@ -51110,6 +51186,30 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(NotMask, DL, VT));
}
+ // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
+ // iff pow2splat(c1).
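  // Illustration (assumed constant): for v16i8 x with c1 = splat(0x10), the
  // shift amount is the leading-zero count of 0x10 in an i8 element, i.e. 3,
  // so SHL moves bit 4 into the sign bit and the NOT sets the sign bit exactly
  // when (x & 0x10) == 0, matching what PCMPEQ-with-zero reports per lane.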
+ if (Src.getOpcode() == X86ISD::PCMPEQ &&
+ Src.getOperand(0).getOpcode() == ISD::AND &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
+ SDValue LHS = Src.getOperand(0).getOperand(0);
+ SDValue RHS = Src.getOperand(0).getOperand(1);
+ KnownBits KnownRHS = DAG.computeKnownBits(RHS);
+ if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
+ SDLoc DL(N);
+ MVT ShiftVT = SrcVT;
+ if (ShiftVT.getScalarType() == MVT::i8) {
+ // vXi8 shifts - we only care about the signbit so can use PSLLW.
+ ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ LHS = DAG.getBitcast(ShiftVT, LHS);
+ }
+ unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
+ LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
+ ShiftAmt, DAG);
+ LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
+ return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
+ }
+ }
+
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnes(NumBits));
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 7368b64efd9a..6206d8efb3d0 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -61,6 +61,8 @@
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <map>
+
using namespace llvm;
using namespace PatternMatch;