diff options
Diffstat (limited to 'llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp')
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 483 |
1 files changed, 410 insertions, 73 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 76a9ac12062d..2a9a31dab74f 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -239,6 +239,10 @@ private: void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, uint16_t OpcodeWithNoCarry, bool Add, bool Predicated); + /// SelectMVE_VSHLC - Select MVE intrinsics for a shift that carries between + /// vector lanes. + void SelectMVE_VSHLC(SDNode *N, bool Predicated); + /// Select long MVE vector reductions with two vector operands /// Stride is the number of vector element widths the instruction can operate /// on: @@ -266,7 +270,21 @@ private: /// pointer points to a set of NumVecs sub-opcodes used for the /// different stages (e.g. VLD20 versus VLD21) of each load family. void SelectMVE_VLD(SDNode *N, unsigned NumVecs, - const uint16_t *const *Opcodes); + const uint16_t *const *Opcodes, bool HasWriteback); + + /// SelectMVE_VxDUP - Select MVE incrementing-dup instructions. Opcodes is an + /// array of 3 elements for the 8, 16 and 32-bit lane sizes. + void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, + bool Wrapping, bool Predicated); + + /// Select SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D, + /// CX1DA, CX2D, CX2DA, CX3, CX3DA). + /// \arg \c NumExtraOps number of extra operands besides the coprocossor, + /// the accumulator and the immediate operand, i.e. 0 + /// for CX1*, 1 for CX2*, 2 for CX3* + /// \arg \c HasAccum whether the instruction has an accumulator operand + void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps, + bool HasAccum); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used @@ -1173,8 +1191,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, // Only multiples of 4 are allowed for the offset, so the frame object // alignment must be at least 4. MachineFrameInfo &MFI = MF->getFrameInfo(); - if (MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); + if (MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); @@ -1197,9 +1215,9 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, if (RHSC * 4 < MFI.getObjectSize(FI)) { // For LHS+RHS to result in an offset that's a multiple of 4 the object // indexed by the LHS must be 4-byte aligned. - if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); - if (MFI.getObjectAlignment(FI) >= 4) { + if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); + if (MFI.getObjectAlign(FI) >= Align(4)) { Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); @@ -1708,7 +1726,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { EVT LoadedVT; unsigned Opcode = 0; bool isSExtLd, isPre; - unsigned Align; + Align Alignment; ARMVCC::VPTCodes Pred; SDValue PredReg; SDValue Chain, Base, Offset; @@ -1724,7 +1742,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { Chain = LD->getChain(); Base = LD->getBasePtr(); Offset = LD->getOffset(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); Pred = ARMVCC::None; @@ -1740,7 +1758,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { Chain = LD->getChain(); Base = LD->getBasePtr(); Offset = LD->getOffset(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); Pred = ARMVCC::Then; @@ -1754,7 +1772,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N); SDValue NewOffset; - if (Align >= 2 && LoadedVT == MVT::v4i16 && + if (Alignment >= Align(2) && LoadedVT == MVT::v4i16 && SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; @@ -1772,12 +1790,12 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; else Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; - } else if (Align >= 4 && + } else if (Alignment >= Align(4) && (CanChangeType || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2)) Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; - else if (Align >= 2 && + else if (Alignment >= Align(2) && (CanChangeType || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) @@ -1791,8 +1809,8 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { SDValue Ops[] = {Base, NewOffset, CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg, Chain}; - SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0), - MVT::i32, MVT::Other, Ops); + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, + N->getValueType(0), MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); ReplaceUses(SDValue(N, 1), SDValue(New, 0)); @@ -2038,6 +2056,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { + assert(Subtarget->hasNEON()); assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); SDLoc dl(N); @@ -2059,6 +2078,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; @@ -2066,6 +2086,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v8f16: + case MVT::v8bf16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -2177,6 +2198,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { + assert(Subtarget->hasNEON()); assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); SDLoc dl(N); @@ -2201,6 +2223,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; @@ -2208,6 +2231,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v8f16: + case MVT::v8bf16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -2328,6 +2352,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes) { + assert(Subtarget->hasNEON()); assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); SDLoc dl(N); @@ -2368,11 +2393,13 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; // Quad-register operations: case MVT::v8f16: + case MVT::v8bf16: case MVT::v8i16: OpcodeIndex = 0; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 1; break; @@ -2511,7 +2538,16 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, Ops.push_back(N->getOperand(0)); // chain - CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); + SmallVector<EVT, 8> VTs; + VTs.push_back(N->getValueType(1)); + VTs.push_back(N->getValueType(0)); + VTs.push_back(N->getValueType(2)); + + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), VTs, Ops); + ReplaceUses(SDValue(N, 0), SDValue(New, 1)); + ReplaceUses(SDValue(N, 1), SDValue(New, 0)); + ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + CurDAG->RemoveDeadNode(N); } void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, @@ -2581,6 +2617,25 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } +void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) { + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + + // One vector input, followed by a 32-bit word of bits to shift in + // and then an immediate shift count + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(2)); + int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(); + Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(4)); + else + AddEmptyMVEPredicateToOps(Ops, Loc); + + CurDAG->SelectNodeTo(N, ARM::MVE_VSHLC, N->getVTList(), makeArrayRef(Ops)); +} + static bool SDValueToConstBool(SDValue SDVal) { assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant"); ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal); @@ -2673,7 +2728,8 @@ void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, } void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, - const uint16_t *const *Opcodes) { + const uint16_t *const *Opcodes, + bool HasWriteback) { EVT VT = N->getValueType(0); SDLoc Loc(N); @@ -2693,23 +2749,141 @@ void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, } EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2); - EVT ResultTys[] = {DataTy, MVT::Other}; + SmallVector<EVT, 4> ResultTys = {DataTy, MVT::Other}; + unsigned PtrOperand = HasWriteback ? 1 : 2; auto Data = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0); SDValue Chain = N->getOperand(0); - for (unsigned Stage = 0; Stage < NumVecs; ++Stage) { - SDValue Ops[] = {Data, N->getOperand(2), Chain}; + // Add a MVE_VLDn instruction for each Vec, except the last + for (unsigned Stage = 0; Stage < NumVecs - 1; ++Stage) { + SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain}; auto LoadInst = CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops); Data = SDValue(LoadInst, 0); Chain = SDValue(LoadInst, 1); } + // The last may need a writeback on it + if (HasWriteback) + ResultTys = {DataTy, MVT::i32, MVT::Other}; + SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain}; + auto LoadInst = + CurDAG->getMachineNode(OurOpcodes[NumVecs - 1], Loc, ResultTys, Ops); - for (unsigned i = 0; i < NumVecs; i++) + unsigned i; + for (i = 0; i < NumVecs; i++) ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data)); - ReplaceUses(SDValue(N, NumVecs), Chain); + CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, + SDValue(LoadInst, 0))); + if (HasWriteback) + ReplaceUses(SDValue(N, i++), SDValue(LoadInst, 1)); + ReplaceUses(SDValue(N, i), SDValue(LoadInst, HasWriteback ? 2 : 1)); + CurDAG->RemoveDeadNode(N); +} + +void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, + bool Wrapping, bool Predicated) { + EVT VT = N->getValueType(0); + SDLoc Loc(N); + + uint16_t Opcode; + switch (VT.getScalarSizeInBits()) { + case 8: + Opcode = Opcodes[0]; + break; + case 16: + Opcode = Opcodes[1]; + break; + case 32: + Opcode = Opcodes[2]; + break; + default: + llvm_unreachable("bad vector element size in SelectMVE_VxDUP"); + } + + SmallVector<SDValue, 8> Ops; + unsigned OpIdx = 1; + + SDValue Inactive; + if (Predicated) + Inactive = N->getOperand(OpIdx++); + + Ops.push_back(N->getOperand(OpIdx++)); // base + if (Wrapping) + Ops.push_back(N->getOperand(OpIdx++)); // limit + + SDValue ImmOp = N->getOperand(OpIdx++); // step + int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue(); + Ops.push_back(getI32Imm(ImmValue, Loc)); + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(OpIdx), Inactive); + else + AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0)); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode, + size_t NumExtraOps, bool HasAccum) { + bool IsBigEndian = CurDAG->getDataLayout().isBigEndian(); + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + + unsigned OpIdx = 1; + + // Convert and append the immediate operand designating the coprocessor. + SDValue ImmCorpoc = N->getOperand(OpIdx++); + uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue(); + Ops.push_back(getI32Imm(ImmCoprocVal, Loc)); + + // For accumulating variants copy the low and high order parts of the + // accumulator into a register pair and add it to the operand vector. + if (HasAccum) { + SDValue AccLo = N->getOperand(OpIdx++); + SDValue AccHi = N->getOperand(OpIdx++); + if (IsBigEndian) + std::swap(AccLo, AccHi); + Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0)); + } + + // Copy extra operands as-is. + for (size_t I = 0; I < NumExtraOps; I++) + Ops.push_back(N->getOperand(OpIdx++)); + + // Convert and append the immediate operand + SDValue Imm = N->getOperand(OpIdx); + uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue(); + Ops.push_back(getI32Imm(ImmVal, Loc)); + + // Accumulating variants are IT-predicable, add predicate operands. + if (HasAccum) { + SDValue Pred = getAL(CurDAG, Loc); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + Ops.push_back(Pred); + Ops.push_back(PredReg); + } + + // Create the CDE intruction + SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops); + SDValue ResultPair = SDValue(InstrNode, 0); + + // The original intrinsic had two outputs, and the output of the dual-register + // CDE instruction is a register pair. We need to extract the two subregisters + // and replace all uses of the original outputs with the extracted + // subregisters. + uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1}; + if (IsBigEndian) + std::swap(SubRegs[0], SubRegs[1]); + + for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) { + if (SDValue(N, ResIdx).use_empty()) + continue; + SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc, + MVT::i32, ResultPair); + ReplaceUses(SDValue(N, ResIdx), SubReg); + } + CurDAG->RemoveDeadNode(N); } @@ -2718,6 +2892,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { + assert(Subtarget->hasNEON()); assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); SDLoc dl(N); @@ -2754,6 +2929,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, case MVT::v8i16: case MVT::v4f16: case MVT::v8f16: + case MVT::v4bf16: + case MVT::v8bf16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: @@ -3231,7 +3408,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { MachineFunction& MF = CurDAG->getMachineFunction(); MachineMemOperand *MemOp = MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand::MOLoad, 4, Align(4)); CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp}); @@ -3251,8 +3428,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Set the alignment of the frame object to 4, to avoid having to generate // more than one ADD MachineFrameInfo &MFI = MF->getFrameInfo(); - if (MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); + if (MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, CurDAG->getTargetConstant(0, dl, MVT::i32)); return; @@ -3522,6 +3699,14 @@ void ARMDAGToDAGISel::Select(SDNode *N) { const SDValue &Chain = N->getOperand(0); const SDValue &Addr = N->getOperand(1); SelectAddrMode3(Addr, Base, RegOffset, ImmOffset); + if (RegOffset != CurDAG->getRegister(0, MVT::i32)) { + // The register-offset variant of LDRD mandates that the register + // allocated to RegOffset is not reused in any of the remaining operands. + // This restriction is currently not enforced. Therefore emitting this + // variant is explicitly avoided. + Base = Addr; + RegOffset = CurDAG->getRegister(0, MVT::i32); + } SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain}; SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl, {MVT::Untyped, MVT::Other}, Ops); @@ -3529,12 +3714,37 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDValue(New, 0)); SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, SDValue(New, 0)); + transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), Lo); ReplaceUses(SDValue(N, 1), Hi); ReplaceUses(SDValue(N, 2), SDValue(New, 1)); CurDAG->RemoveDeadNode(N); return; } + case ARMISD::STRD: { + if (Subtarget->isThumb2()) + break; // TableGen handles isel in this case. + SDValue Base, RegOffset, ImmOffset; + const SDValue &Chain = N->getOperand(0); + const SDValue &Addr = N->getOperand(3); + SelectAddrMode3(Addr, Base, RegOffset, ImmOffset); + if (RegOffset != CurDAG->getRegister(0, MVT::i32)) { + // The register-offset variant of STRD mandates that the register + // allocated to RegOffset is not reused in any of the remaining operands. + // This restriction is currently not enforced. Therefore emitting this + // variant is explicitly avoided. + Base = Addr; + RegOffset = CurDAG->getRegister(0, MVT::i32); + } + SDNode *RegPair = + createGPRPairNode(MVT::Untyped, N->getOperand(1), N->getOperand(2)); + SDValue Ops[] = {SDValue(RegPair, 0), Base, RegOffset, ImmOffset, Chain}; + SDNode *New = CurDAG->getMachineNode(ARM::STOREDUAL, dl, MVT::Other, Ops); + transferMemOperands(N, New); + ReplaceUses(SDValue(N, 0), SDValue(New, 0)); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::LOOP_DEC: { SDValue Ops[] = { N->getOperand(1), N->getOperand(2), @@ -3877,14 +4087,24 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2_UPD: { - static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed, - ARM::VLD2d16wb_fixed, - ARM::VLD2d32wb_fixed, - ARM::VLD1q64wb_fixed}; - static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, - ARM::VLD2q16PseudoWB_fixed, - ARM::VLD2q32PseudoWB_fixed }; - SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VLD2d8wb_fixed, ARM::VLD2d16wb_fixed, ARM::VLD2d32wb_fixed, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes[] = {ARM::VLD2q8PseudoWB_fixed, + ARM::VLD2q16PseudoWB_fixed, + ARM::VLD2q32PseudoWB_fixed}; + SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + } else { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8, + ARM::MVE_VLD21_8_wb}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16, + ARM::MVE_VLD21_16_wb}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, + ARM::MVE_VLD21_32_wb}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 2, Opcodes, true); + } return; } @@ -3904,17 +4124,30 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD4_UPD: { - static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, - ARM::VLD4d16Pseudo_UPD, - ARM::VLD4d32Pseudo_UPD, - ARM::VLD1d64QPseudoWB_fixed}; - static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, - ARM::VLD4q16Pseudo_UPD, - ARM::VLD4q32Pseudo_UPD }; - static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, - ARM::VLD4q16oddPseudo_UPD, - ARM::VLD4q32oddPseudo_UPD }; - SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD, + ARM::VLD1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = {ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD}; + static const uint16_t QOpcodes1[] = {ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD}; + SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } else { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8, + ARM::MVE_VLD42_8, + ARM::MVE_VLD43_8_wb}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16, + ARM::MVE_VLD42_16, + ARM::MVE_VLD43_16_wb}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32, + ARM::MVE_VLD42_32, + ARM::MVE_VLD43_32_wb}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 4, Opcodes, true); + } return; } @@ -3962,15 +4195,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VST2_UPD: { - static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed, - ARM::VST2d16wb_fixed, - ARM::VST2d32wb_fixed, - ARM::VST1q64wb_fixed}; - static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, - ARM::VST2q16PseudoWB_fixed, - ARM::VST2q32PseudoWB_fixed }; - SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); - return; + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VST2d8wb_fixed, ARM::VST2d16wb_fixed, ARM::VST2d32wb_fixed, + ARM::VST1q64wb_fixed}; + static const uint16_t QOpcodes[] = {ARM::VST2q8PseudoWB_fixed, + ARM::VST2q16PseudoWB_fixed, + ARM::VST2q32PseudoWB_fixed}; + SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); + return; + } + break; } case ARMISD::VST3_UPD: { @@ -3989,18 +4224,20 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VST4_UPD: { - static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD, - ARM::VST4d16Pseudo_UPD, - ARM::VST4d32Pseudo_UPD, - ARM::VST1d64QPseudoWB_fixed}; - static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, - ARM::VST4q16Pseudo_UPD, - ARM::VST4q32Pseudo_UPD }; - static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, - ARM::VST4q16oddPseudo_UPD, - ARM::VST4q32oddPseudo_UPD }; - SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); - return; + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, ARM::VST4d32Pseudo_UPD, + ARM::VST1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = {ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD}; + static const uint16_t QOpcodes1[] = {ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD}; + SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + break; } case ARMISD::VST2LN_UPD: { @@ -4479,7 +4716,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, ARM::MVE_VLD21_32}; static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; - SelectMVE_VLD(N, 2, Opcodes); + SelectMVE_VLD(N, 2, Opcodes, false); return; } @@ -4493,7 +4730,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { ARM::MVE_VLD42_32, ARM::MVE_VLD43_32}; static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; - SelectMVE_VLD(N, 4, Opcodes); + SelectMVE_VLD(N, 4, Opcodes, false); return; } } @@ -4506,6 +4743,29 @@ void ARMDAGToDAGISel::Select(SDNode *N) { default: break; + // Scalar f32 -> bf16 + case Intrinsic::arm_neon_vcvtbfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + llvm::EVT DestTy = N->getValueType(0); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops); + return; + } + + // Vector v4f32 -> v4bf16 + case Intrinsic::arm_neon_vcvtfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4bf16, Ops); + return; + } + case Intrinsic::arm_mve_urshrl: SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false); return; @@ -4524,18 +4784,21 @@ void ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_mve_sqrshrl: SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true); return; - case Intrinsic::arm_mve_lsll: - SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false); - return; - case Intrinsic::arm_mve_asrl: - SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false); - return; case Intrinsic::arm_mve_vadc: case Intrinsic::arm_mve_vadc_predicated: SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true, IntNo == Intrinsic::arm_mve_vadc_predicated); return; + case Intrinsic::arm_mve_vsbc: + case Intrinsic::arm_mve_vsbc_predicated: + SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true, + IntNo == Intrinsic::arm_mve_vsbc_predicated); + return; + case Intrinsic::arm_mve_vshlc: + case Intrinsic::arm_mve_vshlc_predicated: + SelectMVE_VSHLC(N, IntNo == Intrinsic::arm_mve_vshlc_predicated); + return; case Intrinsic::arm_mve_vmlldava: case Intrinsic::arm_mve_vmlldava_predicated: { @@ -4573,6 +4836,80 @@ void ARMDAGToDAGISel::Select(SDNode *N) { OpcodesS, OpcodesU); return; } + + case Intrinsic::arm_mve_vidup: + case Intrinsic::arm_mve_vidup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VIDUPu8, ARM::MVE_VIDUPu16, ARM::MVE_VIDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, false, + IntNo == Intrinsic::arm_mve_vidup_predicated); + return; + } + + case Intrinsic::arm_mve_vddup: + case Intrinsic::arm_mve_vddup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VDDUPu8, ARM::MVE_VDDUPu16, ARM::MVE_VDDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, false, + IntNo == Intrinsic::arm_mve_vddup_predicated); + return; + } + + case Intrinsic::arm_mve_viwdup: + case Intrinsic::arm_mve_viwdup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VIWDUPu8, ARM::MVE_VIWDUPu16, ARM::MVE_VIWDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, true, + IntNo == Intrinsic::arm_mve_viwdup_predicated); + return; + } + + case Intrinsic::arm_mve_vdwdup: + case Intrinsic::arm_mve_vdwdup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VDWDUPu8, ARM::MVE_VDWDUPu16, ARM::MVE_VDWDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, true, + IntNo == Intrinsic::arm_mve_vdwdup_predicated); + return; + } + + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: { + bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da || + IntNo == Intrinsic::arm_cde_cx2da || + IntNo == Intrinsic::arm_cde_cx3da; + size_t NumExtraOps; + uint16_t Opcode; + switch (IntNo) { + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + NumExtraOps = 0; + Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D; + break; + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + NumExtraOps = 1; + Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D; + break; + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: + NumExtraOps = 2; + Opcode = HasAccum ? ARM::CDE_CX3DA : ARM::CDE_CX3D; + break; + default: + llvm_unreachable("Unexpected opcode"); + } + SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum); + return; + } } break; } |