From f8af5cf600354830d4ccf59732403f0f073eccb9 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sun, 22 Dec 2013 00:04:03 +0000 Subject: Vendor import of llvm release_34 branch r197841 (effectively, 3.4 RC3): https://llvm.org/svn/llvm-project/llvm/branches/release_34@197841 --- lib/Target/AArch64/AArch64.td | 7 +- lib/Target/AArch64/AArch64AsmPrinter.cpp | 172 +- lib/Target/AArch64/AArch64AsmPrinter.h | 4 - lib/Target/AArch64/AArch64BranchFixupPass.cpp | 2 +- lib/Target/AArch64/AArch64CallingConv.td | 9 +- lib/Target/AArch64/AArch64FrameLowering.cpp | 35 +- lib/Target/AArch64/AArch64FrameLowering.h | 2 +- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 1066 ++- lib/Target/AArch64/AArch64ISelLowering.cpp | 1814 +++- lib/Target/AArch64/AArch64ISelLowering.h | 147 +- lib/Target/AArch64/AArch64InstrFormats.td | 528 ++ lib/Target/AArch64/AArch64InstrInfo.cpp | 105 +- lib/Target/AArch64/AArch64InstrInfo.h | 4 - lib/Target/AArch64/AArch64InstrInfo.td | 87 +- lib/Target/AArch64/AArch64InstrNEON.td | 8671 ++++++++++++++++++++ lib/Target/AArch64/AArch64MCInstLower.cpp | 7 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 7 +- lib/Target/AArch64/AArch64RegisterInfo.h | 7 +- lib/Target/AArch64/AArch64RegisterInfo.td | 176 +- lib/Target/AArch64/AArch64Subtarget.cpp | 28 +- lib/Target/AArch64/AArch64Subtarget.h | 20 +- lib/Target/AArch64/AArch64TargetMachine.cpp | 1 + lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 521 +- lib/Target/AArch64/CMakeLists.txt | 2 + .../AArch64/Disassembler/AArch64Disassembler.cpp | 795 +- .../AArch64/InstPrinter/AArch64InstPrinter.cpp | 131 + .../AArch64/InstPrinter/AArch64InstPrinter.h | 14 +- .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 4 +- .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 20 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 5 +- lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 10 +- .../AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 106 +- .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 21 +- .../AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 5 +- .../AArch64/TargetInfo/AArch64TargetInfo.cpp | 2 +- lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 68 +- lib/Target/AArch64/Utils/AArch64BaseInfo.h | 65 +- lib/Target/AArch64/Utils/CMakeLists.txt | 2 + lib/Target/ARM/A15SDOptimizer.cpp | 13 +- lib/Target/ARM/ARM.td | 94 +- lib/Target/ARM/ARMAsmPrinter.cpp | 605 +- lib/Target/ARM/ARMAsmPrinter.h | 8 +- lib/Target/ARM/ARMBaseInstrInfo.cpp | 470 +- lib/Target/ARM/ARMBaseInstrInfo.h | 28 +- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 99 +- lib/Target/ARM/ARMBaseRegisterInfo.h | 27 +- lib/Target/ARM/ARMBuildAttrs.h | 71 +- lib/Target/ARM/ARMCallingConv.td | 26 +- lib/Target/ARM/ARMCodeEmitter.cpp | 6 +- lib/Target/ARM/ARMConstantIslandPass.cpp | 3 +- lib/Target/ARM/ARMConstantPoolValue.cpp | 50 +- lib/Target/ARM/ARMConstantPoolValue.h | 33 + lib/Target/ARM/ARMExpandPseudoInsts.cpp | 75 +- lib/Target/ARM/ARMFPUName.def | 32 + lib/Target/ARM/ARMFPUName.h | 26 + lib/Target/ARM/ARMFastISel.cpp | 375 +- lib/Target/ARM/ARMFeatures.h | 93 + lib/Target/ARM/ARMFrameLowering.cpp | 156 +- lib/Target/ARM/ARMHazardRecognizer.cpp | 10 +- lib/Target/ARM/ARMHazardRecognizer.h | 13 +- lib/Target/ARM/ARMISelDAGToDAG.cpp | 566 +- lib/Target/ARM/ARMISelLowering.cpp | 1722 ++-- lib/Target/ARM/ARMISelLowering.h | 72 +- lib/Target/ARM/ARMInstrFormats.td | 290 +- lib/Target/ARM/ARMInstrInfo.cpp | 34 +- lib/Target/ARM/ARMInstrInfo.td | 885 +- lib/Target/ARM/ARMInstrNEON.td | 451 +- lib/Target/ARM/ARMInstrThumb.td | 263 +- lib/Target/ARM/ARMInstrThumb2.td | 849 +- 
lib/Target/ARM/ARMInstrVFP.td | 348 +- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 115 +- lib/Target/ARM/ARMMCInstLower.cpp | 2 +- lib/Target/ARM/ARMMachineFunctionInfo.h | 78 +- lib/Target/ARM/ARMRegisterInfo.cpp | 5 +- lib/Target/ARM/ARMRegisterInfo.h | 6 +- lib/Target/ARM/ARMRegisterInfo.td | 84 +- lib/Target/ARM/ARMSchedule.td | 18 + lib/Target/ARM/ARMScheduleA9.td | 196 +- lib/Target/ARM/ARMScheduleSwift.td | 944 ++- lib/Target/ARM/ARMSelectionDAGInfo.cpp | 4 +- lib/Target/ARM/ARMSelectionDAGInfo.h | 4 +- lib/Target/ARM/ARMSubtarget.cpp | 120 +- lib/Target/ARM/ARMSubtarget.h | 79 +- lib/Target/ARM/ARMTargetMachine.cpp | 15 +- lib/Target/ARM/ARMTargetObjectFile.cpp | 2 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 113 +- lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 1429 ++-- lib/Target/ARM/CMakeLists.txt | 2 +- lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 817 +- lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 284 +- lib/Target/ARM/InstPrinter/ARMInstPrinter.h | 4 + lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h | 12 +- lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 53 +- lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 61 +- lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 735 +- lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h | 27 - lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 2 - lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h | 3 +- lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 102 +- lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 93 +- lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 16 +- .../ARM/MCTargetDesc/ARMMachORelocationInfo.cpp | 43 + .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 129 +- lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp | 157 +- lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h | 73 +- lib/Target/ARM/MCTargetDesc/CMakeLists.txt | 1 + lib/Target/ARM/Thumb1FrameLowering.cpp | 36 +- lib/Target/ARM/Thumb1InstrInfo.cpp | 2 +- lib/Target/ARM/Thumb1RegisterInfo.cpp | 20 +- lib/Target/ARM/Thumb1RegisterInfo.h | 2 +- lib/Target/ARM/Thumb2ITBlockPass.cpp | 71 +- lib/Target/ARM/Thumb2InstrInfo.cpp | 25 +- lib/Target/ARM/Thumb2RegisterInfo.cpp | 6 +- lib/Target/ARM/Thumb2RegisterInfo.h | 6 +- lib/Target/CppBackend/CPPBackend.cpp | 27 +- lib/Target/Hexagon/CMakeLists.txt | 7 +- lib/Target/Hexagon/Hexagon.h | 6 +- lib/Target/Hexagon/Hexagon.td | 30 +- lib/Target/Hexagon/HexagonAsmPrinter.cpp | 4 +- lib/Target/Hexagon/HexagonCallingConvLower.cpp | 8 +- lib/Target/Hexagon/HexagonCallingConvLower.h | 5 +- lib/Target/Hexagon/HexagonCopyToCombine.cpp | 677 ++ lib/Target/Hexagon/HexagonFrameLowering.cpp | 80 +- lib/Target/Hexagon/HexagonHardwareLoops.cpp | 10 +- lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 76 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 104 +- lib/Target/Hexagon/HexagonISelLowering.h | 17 +- lib/Target/Hexagon/HexagonInstrFormats.td | 5 +- lib/Target/Hexagon/HexagonInstrInfo.cpp | 857 +- lib/Target/Hexagon/HexagonInstrInfo.h | 15 +- lib/Target/Hexagon/HexagonInstrInfo.td | 105 +- lib/Target/Hexagon/HexagonInstrInfoV4.td | 265 +- lib/Target/Hexagon/HexagonInstrInfoV5.td | 23 +- lib/Target/Hexagon/HexagonMCInstLower.cpp | 2 +- lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp | 16 + lib/Target/Hexagon/HexagonMachineFunctionInfo.h | 6 +- lib/Target/Hexagon/HexagonMachineScheduler.cpp | 21 +- lib/Target/Hexagon/HexagonMachineScheduler.h | 5 +- lib/Target/Hexagon/HexagonNewValueJump.cpp | 1 + lib/Target/Hexagon/HexagonPeephole.cpp | 2 +- lib/Target/Hexagon/HexagonRegisterInfo.cpp | 26 +- lib/Target/Hexagon/HexagonRegisterInfo.h | 8 +- 
lib/Target/Hexagon/HexagonRegisterInfo.td | 4 +- lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp | 2 +- lib/Target/Hexagon/HexagonSelectionDAGInfo.h | 2 +- .../Hexagon/HexagonSplitConst32AndConst64.cpp | 174 + lib/Target/Hexagon/HexagonSubtarget.cpp | 2 + lib/Target/Hexagon/HexagonSubtarget.h | 2 +- lib/Target/Hexagon/HexagonTargetMachine.cpp | 31 +- lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 8 +- lib/Target/Hexagon/HexagonTargetObjectFile.h | 1 + lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 1833 +---- .../Hexagon/InstPrinter/HexagonInstPrinter.cpp | 15 +- lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 5 +- .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 6 +- lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h | 9 +- .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 11 +- lib/Target/LLVMBuild.txt | 2 +- lib/Target/MBlaze/AsmParser/CMakeLists.txt | 8 - lib/Target/MBlaze/AsmParser/LLVMBuild.txt | 23 - lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp | 572 -- lib/Target/MBlaze/AsmParser/Makefile | 15 - lib/Target/MBlaze/CMakeLists.txt | 37 - lib/Target/MBlaze/Disassembler/CMakeLists.txt | 16 - lib/Target/MBlaze/Disassembler/LLVMBuild.txt | 23 - .../MBlaze/Disassembler/MBlazeDisassembler.cpp | 719 -- .../MBlaze/Disassembler/MBlazeDisassembler.h | 49 - lib/Target/MBlaze/Disassembler/Makefile | 16 - lib/Target/MBlaze/InstPrinter/CMakeLists.txt | 8 - lib/Target/MBlaze/InstPrinter/LLVMBuild.txt | 23 - .../MBlaze/InstPrinter/MBlazeInstPrinter.cpp | 71 - lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h | 43 - lib/Target/MBlaze/InstPrinter/Makefile | 16 - lib/Target/MBlaze/LLVMBuild.txt | 34 - lib/Target/MBlaze/MBlaze.h | 32 - lib/Target/MBlaze/MBlaze.td | 73 - lib/Target/MBlaze/MBlazeAsmPrinter.cpp | 326 - lib/Target/MBlaze/MBlazeCallingConv.td | 24 - lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp | 254 - lib/Target/MBlaze/MBlazeFrameLowering.cpp | 488 -- lib/Target/MBlaze/MBlazeFrameLowering.h | 56 - lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp | 277 - lib/Target/MBlaze/MBlazeISelLowering.cpp | 1154 --- lib/Target/MBlaze/MBlazeISelLowering.h | 179 - lib/Target/MBlaze/MBlazeInstrFPU.td | 219 - lib/Target/MBlaze/MBlazeInstrFSL.td | 229 - lib/Target/MBlaze/MBlazeInstrFormats.td | 228 - lib/Target/MBlaze/MBlazeInstrInfo.cpp | 297 - lib/Target/MBlaze/MBlazeInstrInfo.h | 240 - lib/Target/MBlaze/MBlazeInstrInfo.td | 1051 --- lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp | 112 - lib/Target/MBlaze/MBlazeIntrinsicInfo.h | 33 - lib/Target/MBlaze/MBlazeIntrinsics.td | 131 - lib/Target/MBlaze/MBlazeMCInstLower.cpp | 167 - lib/Target/MBlaze/MBlazeMCInstLower.h | 47 - lib/Target/MBlaze/MBlazeMachineFunction.cpp | 14 - lib/Target/MBlaze/MBlazeMachineFunction.h | 169 - lib/Target/MBlaze/MBlazeRegisterInfo.cpp | 145 - lib/Target/MBlaze/MBlazeRegisterInfo.h | 71 - lib/Target/MBlaze/MBlazeRegisterInfo.td | 148 - lib/Target/MBlaze/MBlazeRelocations.h | 47 - lib/Target/MBlaze/MBlazeSchedule.td | 50 - lib/Target/MBlaze/MBlazeSchedule3.td | 236 - lib/Target/MBlaze/MBlazeSchedule5.td | 267 - lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp | 23 - lib/Target/MBlaze/MBlazeSelectionDAGInfo.h | 31 - lib/Target/MBlaze/MBlazeSubtarget.cpp | 56 - lib/Target/MBlaze/MBlazeSubtarget.h | 75 - lib/Target/MBlaze/MBlazeTargetMachine.cpp | 81 - lib/Target/MBlaze/MBlazeTargetMachine.h | 80 - lib/Target/MBlaze/MBlazeTargetObjectFile.cpp | 90 - lib/Target/MBlaze/MBlazeTargetObjectFile.h | 40 - lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt | 9 - lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt | 23 - .../MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp | 
171 - lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h | 237 - .../MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp | 77 - lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp | 26 - lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h | 30 - .../MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp | 222 - .../MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp | 141 - .../MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h | 56 - lib/Target/MBlaze/MCTargetDesc/Makefile | 16 - lib/Target/MBlaze/Makefile | 23 - lib/Target/MBlaze/TODO | 21 - lib/Target/MBlaze/TargetInfo/CMakeLists.txt | 8 - lib/Target/MBlaze/TargetInfo/LLVMBuild.txt | 23 - lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp | 19 - lib/Target/MBlaze/TargetInfo/Makefile | 15 - lib/Target/MSP430/CMakeLists.txt | 2 +- lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp | 5 +- lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h | 7 +- lib/Target/MSP430/MSP430AsmPrinter.cpp | 2 +- lib/Target/MSP430/MSP430CallingConv.td | 7 +- lib/Target/MSP430/MSP430FrameLowering.h | 4 +- lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 10 +- lib/Target/MSP430/MSP430ISelLowering.cpp | 191 +- lib/Target/MSP430/MSP430ISelLowering.h | 13 +- lib/Target/MSP430/MSP430InstrInfo.cpp | 7 +- lib/Target/MSP430/MSP430InstrInfo.h | 1 + lib/Target/MSP430/MSP430InstrInfo.td | 4 +- lib/Target/MSP430/MSP430MCInstLower.cpp | 2 +- lib/Target/MSP430/MSP430RegisterInfo.cpp | 6 +- lib/Target/MSP430/MSP430RegisterInfo.h | 3 +- lib/Target/MSP430/MSP430RegisterInfo.td | 2 +- lib/Target/MSP430/MSP430TargetMachine.cpp | 4 +- lib/Target/Mangler.cpp | 149 +- lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 1572 ++-- lib/Target/Mips/CMakeLists.txt | 4 +- lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 446 +- lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp | 89 +- lib/Target/Mips/InstPrinter/MipsInstPrinter.h | 9 +- lib/Target/Mips/MCTargetDesc/CMakeLists.txt | 3 +- lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 43 +- .../Mips/MCTargetDesc/MipsDirectObjLower.cpp | 81 - lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h | 28 - .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 39 + lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp | 89 - lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h | 43 - lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 39 + lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 3 +- lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h | 7 +- lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 210 +- lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 49 +- lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 16 +- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 67 + lib/Target/Mips/MSA.txt | 78 + lib/Target/Mips/MicroMipsInstrFormats.td | 196 +- lib/Target/Mips/MicroMipsInstrInfo.td | 228 +- lib/Target/Mips/Mips.h | 1 - lib/Target/Mips/Mips.td | 3 + lib/Target/Mips/Mips16FrameLowering.cpp | 26 +- lib/Target/Mips/Mips16FrameLowering.h | 2 +- lib/Target/Mips/Mips16HardFloat.cpp | 517 ++ lib/Target/Mips/Mips16HardFloat.h | 54 + lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 21 +- lib/Target/Mips/Mips16ISelDAGToDAG.h | 2 +- lib/Target/Mips/Mips16ISelLowering.cpp | 239 +- lib/Target/Mips/Mips16ISelLowering.h | 4 +- lib/Target/Mips/Mips16InstrFormats.td | 18 +- lib/Target/Mips/Mips16InstrInfo.cpp | 203 +- lib/Target/Mips/Mips16InstrInfo.h | 15 +- lib/Target/Mips/Mips16InstrInfo.td | 188 +- lib/Target/Mips/Mips16RegisterInfo.cpp | 17 +- lib/Target/Mips/Mips16RegisterInfo.h | 4 +- lib/Target/Mips/Mips64InstrInfo.td | 440 +- lib/Target/Mips/MipsAnalyzeImmediate.cpp | 2 +- lib/Target/Mips/MipsAnalyzeImmediate.h | 10 +- 
lib/Target/Mips/MipsAsmPrinter.cpp | 141 +- lib/Target/Mips/MipsAsmPrinter.h | 25 +- lib/Target/Mips/MipsCallingConv.td | 30 +- lib/Target/Mips/MipsCodeEmitter.cpp | 40 +- lib/Target/Mips/MipsCondMov.td | 210 +- lib/Target/Mips/MipsConstantIslandPass.cpp | 1470 +++- lib/Target/Mips/MipsDSPInstrInfo.td | 468 +- lib/Target/Mips/MipsDelaySlotFiller.cpp | 38 +- lib/Target/Mips/MipsISelDAGToDAG.cpp | 81 +- lib/Target/Mips/MipsISelDAGToDAG.h | 38 + lib/Target/Mips/MipsISelLowering.cpp | 824 +- lib/Target/Mips/MipsISelLowering.h | 184 +- lib/Target/Mips/MipsInstrFPU.td | 456 +- lib/Target/Mips/MipsInstrFormats.td | 143 +- lib/Target/Mips/MipsInstrInfo.cpp | 43 +- lib/Target/Mips/MipsInstrInfo.h | 16 +- lib/Target/Mips/MipsInstrInfo.td | 983 ++- lib/Target/Mips/MipsJITInfo.cpp | 6 +- lib/Target/Mips/MipsLongBranch.cpp | 25 +- lib/Target/Mips/MipsMCInstLower.cpp | 5 +- lib/Target/Mips/MipsMCInstLower.h | 4 +- lib/Target/Mips/MipsMSAInstrFormats.td | 406 + lib/Target/Mips/MipsMSAInstrInfo.td | 3694 +++++++++ lib/Target/Mips/MipsMachineFunction.cpp | 72 +- lib/Target/Mips/MipsMachineFunction.h | 103 +- lib/Target/Mips/MipsOs16.cpp | 45 +- lib/Target/Mips/MipsRegisterInfo.cpp | 79 +- lib/Target/Mips/MipsRegisterInfo.h | 8 +- lib/Target/Mips/MipsRegisterInfo.td | 411 +- lib/Target/Mips/MipsSEFrameLowering.cpp | 184 +- lib/Target/Mips/MipsSEFrameLowering.h | 2 +- lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 443 +- lib/Target/Mips/MipsSEISelDAGToDAG.h | 48 +- lib/Target/Mips/MipsSEISelLowering.cpp | 2329 +++++- lib/Target/Mips/MipsSEISelLowering.h | 48 +- lib/Target/Mips/MipsSEInstrInfo.cpp | 310 +- lib/Target/Mips/MipsSEInstrInfo.h | 40 +- lib/Target/Mips/MipsSERegisterInfo.cpp | 85 +- lib/Target/Mips/MipsSERegisterInfo.h | 5 +- lib/Target/Mips/MipsSchedule.td | 15 +- lib/Target/Mips/MipsSubtarget.cpp | 34 +- lib/Target/Mips/MipsSubtarget.h | 31 +- lib/Target/Mips/MipsTargetMachine.cpp | 21 +- lib/Target/Mips/MipsTargetMachine.h | 6 + lib/Target/Mips/MipsTargetStreamer.h | 44 + lib/Target/NVPTX/CMakeLists.txt | 4 +- lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 290 +- lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h | 53 + lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h | 1 - lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 12 +- lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h | 2 +- .../NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 17 + lib/Target/NVPTX/ManagedStringPool.h | 2 +- lib/Target/NVPTX/NVPTX.h | 55 +- lib/Target/NVPTX/NVPTX.td | 6 + lib/Target/NVPTX/NVPTXAllocaHoisting.cpp | 2 +- lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 595 +- lib/Target/NVPTX/NVPTXAsmPrinter.h | 27 +- lib/Target/NVPTX/NVPTXFrameLowering.cpp | 41 +- lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 4 +- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 1049 ++- lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 31 +- lib/Target/NVPTX/NVPTXISelLowering.cpp | 1627 ++-- lib/Target/NVPTX/NVPTXISelLowering.h | 39 +- lib/Target/NVPTX/NVPTXInstrInfo.cpp | 51 +- lib/Target/NVPTX/NVPTXInstrInfo.h | 1 + lib/Target/NVPTX/NVPTXInstrInfo.td | 2114 +++-- lib/Target/NVPTX/NVPTXIntrinsics.td | 632 +- lib/Target/NVPTX/NVPTXMCExpr.cpp | 46 + lib/Target/NVPTX/NVPTXMCExpr.h | 83 + lib/Target/NVPTX/NVPTXNumRegisters.h | 16 - lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 225 + lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 13 +- lib/Target/NVPTX/NVPTXRegisterInfo.h | 2 +- lib/Target/NVPTX/NVPTXRegisterInfo.td | 26 +- lib/Target/NVPTX/NVPTXSection.h | 4 +- lib/Target/NVPTX/NVPTXSplitBBatBar.cpp | 2 +- lib/Target/NVPTX/NVPTXSubtarget.cpp | 20 +- lib/Target/NVPTX/NVPTXSubtarget.h | 2 +- 
lib/Target/NVPTX/NVPTXTargetMachine.cpp | 64 +- lib/Target/NVPTX/NVPTXTargetObjectFile.h | 44 +- lib/Target/NVPTX/NVVMReflect.cpp | 4 +- lib/Target/PowerPC/AsmParser/LLVMBuild.txt | 2 +- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 738 +- lib/Target/PowerPC/CMakeLists.txt | 5 +- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 189 +- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 18 +- lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt | 2 + lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 101 +- .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 340 +- lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h | 28 +- lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 9 +- lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h | 3 +- .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 102 +- lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 155 + lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h | 96 + .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 84 +- lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 8 +- .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 389 + lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp | 32 + lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h | 32 +- lib/Target/PowerPC/Makefile | 2 +- lib/Target/PowerPC/PPC.h | 24 +- lib/Target/PowerPC/PPC.td | 66 +- lib/Target/PowerPC/PPCAsmPrinter.cpp | 218 +- lib/Target/PowerPC/PPCCTRLoops.cpp | 1164 ++- lib/Target/PowerPC/PPCCallingConv.td | 70 +- lib/Target/PowerPC/PPCCodeEmitter.cpp | 51 +- lib/Target/PowerPC/PPCFastISel.cpp | 2236 +++++ lib/Target/PowerPC/PPCFrameLowering.cpp | 635 +- lib/Target/PowerPC/PPCFrameLowering.h | 10 + lib/Target/PowerPC/PPCHazardRecognizers.cpp | 6 +- lib/Target/PowerPC/PPCHazardRecognizers.h | 4 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 115 +- lib/Target/PowerPC/PPCISelLowering.cpp | 1056 ++- lib/Target/PowerPC/PPCISelLowering.h | 143 +- lib/Target/PowerPC/PPCInstr64Bit.td | 165 +- lib/Target/PowerPC/PPCInstrAltivec.td | 52 +- lib/Target/PowerPC/PPCInstrFormats.td | 71 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 65 +- lib/Target/PowerPC/PPCInstrInfo.h | 7 +- lib/Target/PowerPC/PPCInstrInfo.td | 805 +- lib/Target/PowerPC/PPCJITInfo.cpp | 40 +- lib/Target/PowerPC/PPCMCInstLower.cpp | 62 +- lib/Target/PowerPC/PPCMachineFunctionInfo.h | 9 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 264 +- lib/Target/PowerPC/PPCRegisterInfo.h | 11 +- lib/Target/PowerPC/PPCRegisterInfo.td | 20 +- lib/Target/PowerPC/PPCSchedule.td | 8 + lib/Target/PowerPC/PPCScheduleA2.td | 841 +- lib/Target/PowerPC/PPCScheduleE500mc.td | 2 + lib/Target/PowerPC/PPCScheduleE5500.td | 1 + lib/Target/PowerPC/PPCSubtarget.cpp | 154 +- lib/Target/PowerPC/PPCSubtarget.h | 28 +- lib/Target/PowerPC/PPCTargetMachine.cpp | 16 +- lib/Target/PowerPC/PPCTargetObjectFile.cpp | 67 + lib/Target/PowerPC/PPCTargetObjectFile.h | 35 + lib/Target/PowerPC/PPCTargetStreamer.h | 23 + lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 9 + .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 5 +- lib/Target/R600/AMDGPU.h | 66 +- lib/Target/R600/AMDGPU.td | 85 + lib/Target/R600/AMDGPUAsmPrinter.cpp | 86 +- lib/Target/R600/AMDGPUAsmPrinter.h | 12 +- lib/Target/R600/AMDGPUCallingConv.td | 46 +- lib/Target/R600/AMDGPUFrameLowering.cpp | 23 +- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 585 ++ lib/Target/R600/AMDGPUISelLowering.cpp | 445 +- lib/Target/R600/AMDGPUISelLowering.h | 56 +- lib/Target/R600/AMDGPUIndirectAddressing.cpp | 343 - lib/Target/R600/AMDGPUInstrInfo.cpp | 140 +- lib/Target/R600/AMDGPUInstrInfo.h | 50 +- lib/Target/R600/AMDGPUInstrInfo.td | 22 +- 
lib/Target/R600/AMDGPUInstructions.td | 268 +- lib/Target/R600/AMDGPUIntrinsics.td | 2 + lib/Target/R600/AMDGPUMCInstLower.cpp | 43 +- lib/Target/R600/AMDGPUMachineFunction.cpp | 9 +- lib/Target/R600/AMDGPUMachineFunction.h | 9 +- lib/Target/R600/AMDGPURegisterInfo.cpp | 38 +- lib/Target/R600/AMDGPURegisterInfo.h | 11 +- lib/Target/R600/AMDGPURegisterInfo.td | 3 +- lib/Target/R600/AMDGPUStructurizeCFG.cpp | 896 -- lib/Target/R600/AMDGPUSubtarget.cpp | 78 +- lib/Target/R600/AMDGPUSubtarget.h | 33 +- lib/Target/R600/AMDGPUTargetMachine.cpp | 81 +- lib/Target/R600/AMDGPUTargetMachine.h | 61 +- lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 90 + lib/Target/R600/AMDIL.h | 121 - lib/Target/R600/AMDIL7XXDevice.cpp | 115 - lib/Target/R600/AMDIL7XXDevice.h | 72 - lib/Target/R600/AMDILBase.td | 64 - lib/Target/R600/AMDILCFGStructurizer.cpp | 4155 ++++------ lib/Target/R600/AMDILDevice.cpp | 132 - lib/Target/R600/AMDILDevice.h | 117 - lib/Target/R600/AMDILDeviceInfo.cpp | 97 - lib/Target/R600/AMDILDeviceInfo.h | 88 - lib/Target/R600/AMDILDevices.h | 19 - lib/Target/R600/AMDILEvergreenDevice.cpp | 169 - lib/Target/R600/AMDILEvergreenDevice.h | 93 - lib/Target/R600/AMDILISelDAGToDAG.cpp | 666 -- lib/Target/R600/AMDILISelLowering.cpp | 61 +- lib/Target/R600/AMDILInstrInfo.td | 67 +- lib/Target/R600/AMDILIntrinsicInfo.cpp | 4 +- lib/Target/R600/AMDILNIDevice.cpp | 65 - lib/Target/R600/AMDILNIDevice.h | 57 - lib/Target/R600/AMDILSIDevice.cpp | 48 - lib/Target/R600/AMDILSIDevice.h | 39 - lib/Target/R600/CMakeLists.txt | 18 +- lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 137 +- lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h | 4 + lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp | 8 +- lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 13 +- lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h | 4 +- .../R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 21 + lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h | 1 + .../R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2 +- lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h | 4 +- lib/Target/R600/MCTargetDesc/CMakeLists.txt | 1 + lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 102 +- lib/Target/R600/Processors.td | 48 +- lib/Target/R600/R600ClauseMergePass.cpp | 204 + lib/Target/R600/R600ControlFlowFinalizer.cpp | 152 +- lib/Target/R600/R600Defines.h | 122 +- lib/Target/R600/R600EmitClauseMarkers.cpp | 164 +- lib/Target/R600/R600ExpandSpecialInstrs.cpp | 90 +- lib/Target/R600/R600ISelLowering.cpp | 1299 ++- lib/Target/R600/R600ISelLowering.h | 14 +- lib/Target/R600/R600InstrFormats.td | 492 ++ lib/Target/R600/R600InstrInfo.cpp | 825 +- lib/Target/R600/R600InstrInfo.h | 121 +- lib/Target/R600/R600Instructions.td | 1785 ++-- lib/Target/R600/R600Intrinsics.td | 44 + lib/Target/R600/R600MachineFunctionInfo.cpp | 6 +- lib/Target/R600/R600MachineFunctionInfo.h | 3 +- lib/Target/R600/R600MachineScheduler.cpp | 300 +- lib/Target/R600/R600MachineScheduler.h | 44 +- lib/Target/R600/R600OptimizeVectorRegisters.cpp | 380 + lib/Target/R600/R600Packetizer.cpp | 311 +- lib/Target/R600/R600RegisterInfo.cpp | 53 +- lib/Target/R600/R600RegisterInfo.h | 13 +- lib/Target/R600/R600RegisterInfo.td | 79 +- lib/Target/R600/R600Schedule.td | 6 +- lib/Target/R600/R600TextureIntrinsicsReplacer.cpp | 303 + lib/Target/R600/SIAnnotateControlFlow.cpp | 16 +- lib/Target/R600/SIDefines.h | 16 + lib/Target/R600/SIFixSGPRCopies.cpp | 263 + lib/Target/R600/SIISelLowering.cpp | 756 +- lib/Target/R600/SIISelLowering.h | 31 +- lib/Target/R600/SIInsertWaits.cpp | 34 +- lib/Target/R600/SIInstrFormats.td | 
116 +- lib/Target/R600/SIInstrInfo.cpp | 524 +- lib/Target/R600/SIInstrInfo.h | 80 +- lib/Target/R600/SIInstrInfo.td | 286 +- lib/Target/R600/SIInstructions.td | 798 +- lib/Target/R600/SIIntrinsics.td | 26 +- lib/Target/R600/SILowerControlFlow.cpp | 38 +- lib/Target/R600/SIMachineFunctionInfo.cpp | 4 + lib/Target/R600/SIMachineFunctionInfo.h | 1 + lib/Target/R600/SIRegisterInfo.cpp | 88 +- lib/Target/R600/SIRegisterInfo.h | 26 +- lib/Target/R600/SIRegisterInfo.td | 18 +- lib/Target/R600/SITypeRewriter.cpp | 162 + lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp | 2 +- lib/Target/Sparc/CMakeLists.txt | 6 +- lib/Target/Sparc/DelaySlotFiller.cpp | 267 +- lib/Target/Sparc/FPMover.cpp | 141 - lib/Target/Sparc/LLVMBuild.txt | 4 +- lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h | 22 +- lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp | 15 +- lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h | 7 +- lib/Target/Sparc/Makefile | 3 +- lib/Target/Sparc/README.txt | 8 +- lib/Target/Sparc/Sparc.h | 24 +- lib/Target/Sparc/Sparc.td | 6 +- lib/Target/Sparc/SparcAsmPrinter.cpp | 112 +- lib/Target/Sparc/SparcCallingConv.td | 13 +- lib/Target/Sparc/SparcCodeEmitter.cpp | 245 + lib/Target/Sparc/SparcFrameLowering.cpp | 228 +- lib/Target/Sparc/SparcFrameLowering.h | 20 +- lib/Target/Sparc/SparcISelDAGToDAG.cpp | 24 +- lib/Target/Sparc/SparcISelLowering.cpp | 1169 ++- lib/Target/Sparc/SparcISelLowering.h | 50 +- lib/Target/Sparc/SparcInstr64Bit.td | 103 +- lib/Target/Sparc/SparcInstrFormats.td | 92 +- lib/Target/Sparc/SparcInstrInfo.cpp | 177 +- lib/Target/Sparc/SparcInstrInfo.h | 15 +- lib/Target/Sparc/SparcInstrInfo.td | 528 +- lib/Target/Sparc/SparcJITInfo.cpp | 165 + lib/Target/Sparc/SparcJITInfo.h | 67 + lib/Target/Sparc/SparcMachineFunctionInfo.h | 12 +- lib/Target/Sparc/SparcRegisterInfo.cpp | 166 +- lib/Target/Sparc/SparcRegisterInfo.h | 10 +- lib/Target/Sparc/SparcRegisterInfo.td | 124 +- lib/Target/Sparc/SparcRelocations.h | 41 + lib/Target/Sparc/SparcSubtarget.cpp | 33 +- lib/Target/Sparc/SparcSubtarget.h | 16 +- lib/Target/Sparc/SparcTargetMachine.cpp | 9 +- lib/Target/Sparc/SparcTargetMachine.h | 6 + lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp | 8 +- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 475 +- lib/Target/SystemZ/CMakeLists.txt | 9 +- lib/Target/SystemZ/Disassembler/CMakeLists.txt | 7 + lib/Target/SystemZ/Disassembler/LLVMBuild.txt | 23 + lib/Target/SystemZ/Disassembler/Makefile | 16 + .../SystemZ/Disassembler/SystemZDisassembler.cpp | 323 + .../SystemZ/InstPrinter/SystemZInstPrinter.cpp | 23 +- .../SystemZ/InstPrinter/SystemZInstPrinter.h | 3 +- lib/Target/SystemZ/LLVMBuild.txt | 3 +- .../SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp | 49 +- .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 4 +- lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h | 7 +- .../SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 114 +- .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 81 +- .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.h | 38 +- lib/Target/SystemZ/Makefile | 3 +- lib/Target/SystemZ/README.txt | 65 +- lib/Target/SystemZ/SystemZ.h | 42 +- lib/Target/SystemZ/SystemZ.td | 7 +- lib/Target/SystemZ/SystemZAsmPrinter.cpp | 136 +- lib/Target/SystemZ/SystemZCallingConv.td | 4 +- lib/Target/SystemZ/SystemZConstantPoolValue.cpp | 2 +- lib/Target/SystemZ/SystemZElimCompare.cpp | 471 ++ lib/Target/SystemZ/SystemZFrameLowering.cpp | 108 +- lib/Target/SystemZ/SystemZFrameLowering.h | 27 +- lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 604 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 1523 +++- 
lib/Target/SystemZ/SystemZISelLowering.h | 139 +- lib/Target/SystemZ/SystemZInstrFP.td | 320 +- lib/Target/SystemZ/SystemZInstrFormats.td | 1242 ++- lib/Target/SystemZ/SystemZInstrInfo.cpp | 907 +- lib/Target/SystemZ/SystemZInstrInfo.h | 150 +- lib/Target/SystemZ/SystemZInstrInfo.td | 1226 ++- lib/Target/SystemZ/SystemZLongBranch.cpp | 462 ++ lib/Target/SystemZ/SystemZMCInstLower.cpp | 116 +- lib/Target/SystemZ/SystemZMCInstLower.h | 15 +- lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp | 17 + lib/Target/SystemZ/SystemZMachineFunctionInfo.h | 12 +- lib/Target/SystemZ/SystemZOperands.td | 153 +- lib/Target/SystemZ/SystemZOperators.td | 208 +- lib/Target/SystemZ/SystemZPatterns.td | 99 +- lib/Target/SystemZ/SystemZProcessors.td | 46 + lib/Target/SystemZ/SystemZRegisterInfo.cpp | 37 +- lib/Target/SystemZ/SystemZRegisterInfo.h | 16 +- lib/Target/SystemZ/SystemZRegisterInfo.td | 71 +- lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 293 + lib/Target/SystemZ/SystemZSelectionDAGInfo.h | 80 + lib/Target/SystemZ/SystemZShortenInst.cpp | 163 + lib/Target/SystemZ/SystemZSubtarget.cpp | 15 +- lib/Target/SystemZ/SystemZSubtarget.h | 22 + lib/Target/SystemZ/SystemZTargetMachine.cpp | 50 +- lib/Target/SystemZ/SystemZTargetMachine.h | 4 +- lib/Target/Target.cpp | 8 + lib/Target/TargetLibraryInfo.cpp | 44 +- lib/Target/TargetLoweringObjectFile.cpp | 20 +- lib/Target/TargetMachine.cpp | 6 +- lib/Target/TargetMachineC.cpp | 56 +- lib/Target/TargetSubtargetInfo.cpp | 19 + lib/Target/X86/AsmParser/X86AsmParser.cpp | 420 +- lib/Target/X86/CMakeLists.txt | 2 +- lib/Target/X86/Disassembler/X86Disassembler.cpp | 145 +- .../X86/Disassembler/X86DisassemblerDecoder.c | 267 +- .../X86/Disassembler/X86DisassemblerDecoder.h | 106 +- .../Disassembler/X86DisassemblerDecoderCommon.h | 165 +- lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 37 +- lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 22 +- lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 32 +- lib/Target/X86/InstPrinter/X86IntelInstPrinter.h | 52 +- lib/Target/X86/MCTargetDesc/CMakeLists.txt | 2 + lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 401 +- lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 82 +- lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 22 +- .../X86/MCTargetDesc/X86ELFRelocationInfo.cpp | 135 + lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 6 +- lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h | 3 +- lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 335 +- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 35 +- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 13 +- .../X86/MCTargetDesc/X86MachORelocationInfo.cpp | 116 + .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 177 +- .../X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2 +- lib/Target/X86/README-SSE.txt | 31 - lib/Target/X86/X86.td | 71 +- lib/Target/X86/X86AsmPrinter.cpp | 86 +- lib/Target/X86/X86AsmPrinter.h | 20 +- lib/Target/X86/X86CallingConv.h | 35 + lib/Target/X86/X86CallingConv.td | 102 +- lib/Target/X86/X86CodeEmitter.cpp | 47 +- lib/Target/X86/X86FastISel.cpp | 488 +- lib/Target/X86/X86FixupLEAs.cpp | 11 +- lib/Target/X86/X86FloatingPoint.cpp | 18 +- lib/Target/X86/X86FrameLowering.cpp | 337 +- lib/Target/X86/X86FrameLowering.h | 27 - lib/Target/X86/X86ISelDAGToDAG.cpp | 226 +- lib/Target/X86/X86ISelLowering.cpp | 3138 +++++-- lib/Target/X86/X86ISelLowering.h | 136 +- lib/Target/X86/X86InstrAVX512.td | 3526 ++++++++ lib/Target/X86/X86InstrArithmetic.td | 250 +- lib/Target/X86/X86InstrCompiler.td | 130 +- lib/Target/X86/X86InstrControl.td | 5 +- lib/Target/X86/X86InstrExtension.td | 
68 +- lib/Target/X86/X86InstrFMA.td | 77 +- lib/Target/X86/X86InstrFPStack.td | 30 +- lib/Target/X86/X86InstrFormats.td | 181 +- lib/Target/X86/X86InstrFragmentsSIMD.td | 141 +- lib/Target/X86/X86InstrInfo.cpp | 1000 ++- lib/Target/X86/X86InstrInfo.h | 26 +- lib/Target/X86/X86InstrInfo.td | 654 +- lib/Target/X86/X86InstrMMX.td | 93 +- lib/Target/X86/X86InstrSSE.td | 1833 +++-- lib/Target/X86/X86InstrSVM.td | 18 +- lib/Target/X86/X86InstrShiftRotate.td | 136 +- lib/Target/X86/X86InstrSystem.td | 76 +- lib/Target/X86/X86InstrTSX.td | 7 + lib/Target/X86/X86InstrXOP.td | 146 +- lib/Target/X86/X86JITInfo.cpp | 3 +- lib/Target/X86/X86MCInstLower.cpp | 270 +- lib/Target/X86/X86RegisterInfo.cpp | 89 +- lib/Target/X86/X86RegisterInfo.h | 10 +- lib/Target/X86/X86RegisterInfo.td | 92 +- lib/Target/X86/X86SchedHaswell.td | 14 +- lib/Target/X86/X86SchedSandyBridge.td | 13 +- lib/Target/X86/X86Schedule.td | 73 +- lib/Target/X86/X86ScheduleAtom.td | 41 +- lib/Target/X86/X86ScheduleSLM.td | 668 ++ lib/Target/X86/X86SelectionDAGInfo.cpp | 6 +- lib/Target/X86/X86SelectionDAGInfo.h | 4 +- lib/Target/X86/X86Subtarget.cpp | 60 +- lib/Target/X86/X86Subtarget.h | 51 +- lib/Target/X86/X86TargetMachine.cpp | 10 +- lib/Target/X86/X86TargetObjectFile.cpp | 10 +- lib/Target/X86/X86TargetObjectFile.h | 3 + lib/Target/X86/X86TargetTransformInfo.cpp | 234 +- lib/Target/X86/X86VZeroUpper.cpp | 45 +- lib/Target/XCore/CMakeLists.txt | 3 +- .../XCore/Disassembler/XCoreDisassembler.cpp | 8 +- lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp | 10 +- lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h | 6 +- .../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 10 +- lib/Target/XCore/README.txt | 1 + lib/Target/XCore/XCore.h | 2 + lib/Target/XCore/XCoreAsmPrinter.cpp | 80 +- lib/Target/XCore/XCoreFrameLowering.cpp | 107 +- lib/Target/XCore/XCoreISelDAGToDAG.cpp | 14 +- lib/Target/XCore/XCoreISelLowering.cpp | 200 +- lib/Target/XCore/XCoreISelLowering.h | 24 +- lib/Target/XCore/XCoreInstrInfo.cpp | 17 +- lib/Target/XCore/XCoreInstrInfo.h | 7 +- lib/Target/XCore/XCoreInstrInfo.td | 26 +- lib/Target/XCore/XCoreLowerThreadLocal.cpp | 114 +- lib/Target/XCore/XCoreMCInstLower.cpp | 2 +- lib/Target/XCore/XCoreRegisterInfo.cpp | 6 +- lib/Target/XCore/XCoreRegisterInfo.h | 4 +- lib/Target/XCore/XCoreTargetMachine.cpp | 9 + lib/Target/XCore/XCoreTargetMachine.h | 2 + lib/Target/XCore/XCoreTargetTransformInfo.cpp | 83 + 723 files changed, 90115 insertions(+), 42790 deletions(-) create mode 100644 lib/Target/AArch64/AArch64InstrNEON.td create mode 100644 lib/Target/ARM/ARMFPUName.def create mode 100644 lib/Target/ARM/ARMFPUName.h create mode 100644 lib/Target/ARM/ARMFeatures.h delete mode 100644 lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h create mode 100644 lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp create mode 100644 lib/Target/Hexagon/HexagonCopyToCombine.cpp create mode 100644 lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp create mode 100644 lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp delete mode 100644 lib/Target/MBlaze/AsmParser/CMakeLists.txt delete mode 100644 lib/Target/MBlaze/AsmParser/LLVMBuild.txt delete mode 100644 lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp delete mode 100644 lib/Target/MBlaze/AsmParser/Makefile delete mode 100644 lib/Target/MBlaze/CMakeLists.txt delete mode 100644 lib/Target/MBlaze/Disassembler/CMakeLists.txt delete mode 100644 lib/Target/MBlaze/Disassembler/LLVMBuild.txt delete mode 100644 lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp delete mode 100644 
lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h delete mode 100644 lib/Target/MBlaze/Disassembler/Makefile delete mode 100644 lib/Target/MBlaze/InstPrinter/CMakeLists.txt delete mode 100644 lib/Target/MBlaze/InstPrinter/LLVMBuild.txt delete mode 100644 lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp delete mode 100644 lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h delete mode 100644 lib/Target/MBlaze/InstPrinter/Makefile delete mode 100644 lib/Target/MBlaze/LLVMBuild.txt delete mode 100644 lib/Target/MBlaze/MBlaze.h delete mode 100644 lib/Target/MBlaze/MBlaze.td delete mode 100644 lib/Target/MBlaze/MBlazeAsmPrinter.cpp delete mode 100644 lib/Target/MBlaze/MBlazeCallingConv.td delete mode 100644 lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp delete mode 100644 lib/Target/MBlaze/MBlazeFrameLowering.cpp delete mode 100644 lib/Target/MBlaze/MBlazeFrameLowering.h delete mode 100644 lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp delete mode 100644 lib/Target/MBlaze/MBlazeISelLowering.cpp delete mode 100644 lib/Target/MBlaze/MBlazeISelLowering.h delete mode 100644 lib/Target/MBlaze/MBlazeInstrFPU.td delete mode 100644 lib/Target/MBlaze/MBlazeInstrFSL.td delete mode 100644 lib/Target/MBlaze/MBlazeInstrFormats.td delete mode 100644 lib/Target/MBlaze/MBlazeInstrInfo.cpp delete mode 100644 lib/Target/MBlaze/MBlazeInstrInfo.h delete mode 100644 lib/Target/MBlaze/MBlazeInstrInfo.td delete mode 100644 lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp delete mode 100644 lib/Target/MBlaze/MBlazeIntrinsicInfo.h delete mode 100644 lib/Target/MBlaze/MBlazeIntrinsics.td delete mode 100644 lib/Target/MBlaze/MBlazeMCInstLower.cpp delete mode 100644 lib/Target/MBlaze/MBlazeMCInstLower.h delete mode 100644 lib/Target/MBlaze/MBlazeMachineFunction.cpp delete mode 100644 lib/Target/MBlaze/MBlazeMachineFunction.h delete mode 100644 lib/Target/MBlaze/MBlazeRegisterInfo.cpp delete mode 100644 lib/Target/MBlaze/MBlazeRegisterInfo.h delete mode 100644 lib/Target/MBlaze/MBlazeRegisterInfo.td delete mode 100644 lib/Target/MBlaze/MBlazeRelocations.h delete mode 100644 lib/Target/MBlaze/MBlazeSchedule.td delete mode 100644 lib/Target/MBlaze/MBlazeSchedule3.td delete mode 100644 lib/Target/MBlaze/MBlazeSchedule5.td delete mode 100644 lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp delete mode 100644 lib/Target/MBlaze/MBlazeSelectionDAGInfo.h delete mode 100644 lib/Target/MBlaze/MBlazeSubtarget.cpp delete mode 100644 lib/Target/MBlaze/MBlazeSubtarget.h delete mode 100644 lib/Target/MBlaze/MBlazeTargetMachine.cpp delete mode 100644 lib/Target/MBlaze/MBlazeTargetMachine.h delete mode 100644 lib/Target/MBlaze/MBlazeTargetObjectFile.cpp delete mode 100644 lib/Target/MBlaze/MBlazeTargetObjectFile.h delete mode 100644 lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt delete mode 100644 lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp delete mode 100644 lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h delete mode 100644 lib/Target/MBlaze/MCTargetDesc/Makefile delete mode 100644 lib/Target/MBlaze/Makefile delete mode 100644 lib/Target/MBlaze/TODO delete mode 
100644 lib/Target/MBlaze/TargetInfo/CMakeLists.txt delete mode 100644 lib/Target/MBlaze/TargetInfo/LLVMBuild.txt delete mode 100644 lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp delete mode 100644 lib/Target/MBlaze/TargetInfo/Makefile delete mode 100644 lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp delete mode 100644 lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h delete mode 100644 lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp delete mode 100644 lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h create mode 100644 lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp create mode 100644 lib/Target/Mips/MSA.txt create mode 100644 lib/Target/Mips/Mips16HardFloat.cpp create mode 100644 lib/Target/Mips/Mips16HardFloat.h create mode 100644 lib/Target/Mips/MipsMSAInstrFormats.td create mode 100644 lib/Target/Mips/MipsMSAInstrInfo.td create mode 100644 lib/Target/Mips/MipsTargetStreamer.h create mode 100644 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h create mode 100644 lib/Target/NVPTX/NVPTXMCExpr.cpp create mode 100644 lib/Target/NVPTX/NVPTXMCExpr.h delete mode 100644 lib/Target/NVPTX/NVPTXNumRegisters.h create mode 100644 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h create mode 100644 lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp create mode 100644 lib/Target/PowerPC/PPCFastISel.cpp create mode 100644 lib/Target/PowerPC/PPCTargetObjectFile.cpp create mode 100644 lib/Target/PowerPC/PPCTargetObjectFile.h create mode 100644 lib/Target/PowerPC/PPCTargetStreamer.h create mode 100644 lib/Target/R600/AMDGPUISelDAGToDAG.cpp delete mode 100644 lib/Target/R600/AMDGPUIndirectAddressing.cpp delete mode 100644 lib/Target/R600/AMDGPUStructurizeCFG.cpp create mode 100644 lib/Target/R600/AMDGPUTargetTransformInfo.cpp delete mode 100644 lib/Target/R600/AMDIL.h delete mode 100644 lib/Target/R600/AMDIL7XXDevice.cpp delete mode 100644 lib/Target/R600/AMDIL7XXDevice.h delete mode 100644 lib/Target/R600/AMDILDevice.cpp delete mode 100644 lib/Target/R600/AMDILDevice.h delete mode 100644 lib/Target/R600/AMDILDeviceInfo.cpp delete mode 100644 lib/Target/R600/AMDILDeviceInfo.h delete mode 100644 lib/Target/R600/AMDILDevices.h delete mode 100644 lib/Target/R600/AMDILEvergreenDevice.cpp delete mode 100644 lib/Target/R600/AMDILEvergreenDevice.h delete mode 100644 lib/Target/R600/AMDILISelDAGToDAG.cpp delete mode 100644 lib/Target/R600/AMDILNIDevice.cpp delete mode 100644 lib/Target/R600/AMDILNIDevice.h delete mode 100644 lib/Target/R600/AMDILSIDevice.cpp delete mode 100644 lib/Target/R600/AMDILSIDevice.h create mode 100644 lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp create mode 100644 lib/Target/R600/R600ClauseMergePass.cpp create mode 100644 lib/Target/R600/R600InstrFormats.td create mode 100644 lib/Target/R600/R600OptimizeVectorRegisters.cpp create mode 100644 lib/Target/R600/R600TextureIntrinsicsReplacer.cpp create mode 100644 lib/Target/R600/SIFixSGPRCopies.cpp create mode 100644 lib/Target/R600/SITypeRewriter.cpp delete mode 100644 lib/Target/Sparc/FPMover.cpp create mode 100644 lib/Target/Sparc/SparcCodeEmitter.cpp create mode 100644 lib/Target/Sparc/SparcJITInfo.cpp create mode 100644 lib/Target/Sparc/SparcJITInfo.h create mode 100644 lib/Target/Sparc/SparcRelocations.h create mode 100644 lib/Target/SystemZ/Disassembler/CMakeLists.txt create mode 100644 lib/Target/SystemZ/Disassembler/LLVMBuild.txt create mode 100644 lib/Target/SystemZ/Disassembler/Makefile 
create mode 100644 lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp create mode 100644 lib/Target/SystemZ/SystemZElimCompare.cpp create mode 100644 lib/Target/SystemZ/SystemZLongBranch.cpp create mode 100644 lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp create mode 100644 lib/Target/SystemZ/SystemZProcessors.td create mode 100644 lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp create mode 100644 lib/Target/SystemZ/SystemZSelectionDAGInfo.h create mode 100644 lib/Target/SystemZ/SystemZShortenInst.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp create mode 100644 lib/Target/X86/X86CallingConv.h create mode 100644 lib/Target/X86/X86InstrAVX512.td create mode 100644 lib/Target/X86/X86ScheduleSLM.td create mode 100644 lib/Target/XCore/XCoreTargetTransformInfo.cpp (limited to 'lib/Target') diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e17052b4a565..9c2c69a65935 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -21,8 +21,11 @@ include "llvm/Target/Target.td" // AArch64 Subtarget features. // +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", + "Enable ARMv8 FP">; + def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable Advanced SIMD instructions">; + "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable cryptographic instructions">; @@ -33,7 +36,7 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", include "AArch64Schedule.td" -def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>; +def : Processor<"generic", GenericItineraries, [FeatureFPARMv8]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 47ebb826e0d0..d59ca56ba998 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -27,32 +27,23 @@ using namespace llvm; -MachineLocation -AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - // See emitFrameIndexDebugValue in InstrInfo for where this instruction is - // expected to be created. - assert(MI->getNumOperands() == 4 && MI->getOperand(0).isReg() - && MI->getOperand(1).isImm() && "unexpected custom DBG_VALUE"); - return MachineLocation(MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); -} - /// Try to print a floating-point register as if it belonged to a specified /// register-class. For example the inline asm operand modifier "b" requires its /// argument to be printed as "bN". static bool printModifiedFPRAsmOperand(const MachineOperand &MO, const TargetRegisterInfo *TRI, - const TargetRegisterClass &RegClass, - raw_ostream &O) { + char RegType, raw_ostream &O) { if (!MO.isReg()) return true; for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) { - if (RegClass.contains(*AR)) { - O << AArch64InstPrinter::getRegisterName(*AR); + if (AArch64::FPR8RegClass.contains(*AR)) { + O << RegType << TRI->getEncodingValue(MO.getReg()); return false; } } + + // The register doesn't correspond to anything floating-point like. 
return true; } @@ -91,9 +82,9 @@ bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO, StringRef Modifier; switch (MO.getType()) { default: - llvm_unreachable("Unexpected operand for symbolic address constraint"); + return true; case MachineOperand::MO_GlobalAddress: - Name = Mang->getSymbol(MO.getGlobal())->getName(); + Name = getSymbol(MO.getGlobal())->getName(); // Global variables may be accessed either via a GOT or in various fun and // interesting TLS-model specific ways. Set the prefix modifier as @@ -155,57 +146,29 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); - if (!ExtraCode || !ExtraCode[0]) { - // There's actually no operand modifier, which leads to a slightly eclectic - // set of behaviour which we have to handle here. - const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - llvm_unreachable("Unexpected operand for inline assembly"); - case MachineOperand::MO_Register: - // GCC prints the unmodified operand of a 'w' constraint as the vector - // register. Technically, we could allocate the argument as a VPR128, but - // that leads to extremely dodgy copies being generated to get the data - // there. - if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O)) - O << AArch64InstPrinter::getRegisterName(MO.getReg()); - break; - case MachineOperand::MO_Immediate: - O << '#' << MO.getImm(); - break; - case MachineOperand::MO_FPImmediate: - assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected"); - O << "#0.0"; - break; - case MachineOperand::MO_BlockAddress: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - return printSymbolicAddress(MO, false, "", O); - } - return false; - } - // We have a real modifier to handle. + if (!ExtraCode) + ExtraCode = ""; + switch(ExtraCode[0]) { default: - // See if this is a generic operand - return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); - case 'c': // Don't print "#" before an immediate operand. - if (!MI->getOperand(OpNum).isImm()) - return true; - O << MI->getOperand(OpNum).getImm(); - return false; + if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + return false; + break; case 'w': // Output 32-bit general register operand, constant zero as wzr, or stack // pointer as wsp. Ignored when used with other operand types. - return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR32RegClass, O); + if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::GPR32RegClass, O)) + return false; + break; case 'x': // Output 64-bit general register operand, constant zero as xzr, or stack // pointer as sp. Ignored when used with other operand types. - return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR64RegClass, O); + if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::GPR64RegClass, O)) + return false; + break; case 'H': // Output higher numbered of a 64-bit general register pair case 'Q': @@ -221,40 +184,65 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // copies ...). llvm_unreachable("FIXME: Unimplemented register pairs"); case 'b': - // Output 8-bit FP/SIMD scalar register operand, prefixed with b. 
- return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR8RegClass, O); case 'h': - // Output 16-bit FP/SIMD scalar register operand, prefixed with h. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR16RegClass, O); case 's': - // Output 32-bit FP/SIMD scalar register operand, prefixed with s. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR32RegClass, O); case 'd': - // Output 64-bit FP/SIMD scalar register operand, prefixed with d. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR64RegClass, O); case 'q': - // Output 128-bit FP/SIMD scalar register operand, prefixed with q. - return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::FPR128RegClass, O); + if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + ExtraCode[0], O)) + return false; + break; case 'A': // Output symbolic address with appropriate relocation modifier (also // suitable for ADRP). - return printSymbolicAddress(MI->getOperand(OpNum), false, "", O); + if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O)) + return false; + break; case 'L': // Output bits 11:0 of symbolic address with appropriate :lo12: relocation // modifier. - return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O); + if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O)) + return false; + break; case 'G': // Output bits 23:12 of symbolic address with appropriate :hi12: relocation // modifier (currently only for TLS local exec). - return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O); + if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O)) + return false; + break; + case 'a': + return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); } + // There's actually no operand modifier, which leads to a slightly eclectic + // set of behaviour which we have to handle here. + const MachineOperand &MO = MI->getOperand(OpNum); + switch (MO.getType()) { + default: + llvm_unreachable("Unexpected operand for inline assembly"); + case MachineOperand::MO_Register: + // GCC prints the unmodified operand of a 'w' constraint as the vector + // register. Technically, we could allocate the argument as a VPR128, but + // that leads to extremely dodgy copies being generated to get the data + // there. + if (printModifiedFPRAsmOperand(MO, TRI, 'v', O)) + O << AArch64InstPrinter::getRegisterName(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + O << '#' << MO.getImm(); + break; + case MachineOperand::MO_FPImmediate: + assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected"); + O << "#0.0"; + break; + case MachineOperand::MO_BlockAddress: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + return printSymbolicAddress(MO, false, "", O); + } + return false; } bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, @@ -271,24 +259,6 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } -void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps==4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast(MI->getOperand(NOps-1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. 
- assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '[' << AArch64InstPrinter::getRegisterName(MI->getOperand(0).getReg()); - OS << '+' << MI->getOperand(1).getImm(); - OS << ']'; - OS << "+" << MI->getOperand(NOps - 2).getImm(); -} - - #include "AArch64GenMCPseudoLowering.inc" void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { @@ -296,18 +266,6 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(OutStreamer, MI)) return; - switch (MI->getOpcode()) { - case AArch64::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } - } - MCInst TmpInst; LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this); OutStreamer.EmitInstruction(TmpInst); @@ -329,7 +287,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer.EmitLabel(Stubs[i].first); OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(0), 0); + TD->getPointerSize(0)); } Stubs.clear(); } diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h index af0c9fed066f..824f0036bc5b 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.h +++ b/lib/Target/AArch64/AArch64AsmPrinter.h @@ -55,8 +55,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { unsigned AsmVariant, const char *ExtraCode, raw_ostream &O); - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - /// printSymbolicAddress - Given some kind of reasonably bare symbolic /// reference, print out the appropriate asm string to represent it. If /// appropriate, a relocation-specifier will be produced, composed of a @@ -67,8 +65,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { bool PrintImmediatePrefix, StringRef Suffix, raw_ostream &O); - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; - virtual const char *getPassName() const { return "AArch64 Assembly Printer"; } diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp index 71233ba5c3dc..11e7f41a3964 100644 --- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp +++ b/lib/Target/AArch64/AArch64BranchFixupPass.cpp @@ -87,7 +87,7 @@ namespace { // If the block size isn't a multiple of the known bits, assume the // worst case padding. if (Size & ((1u << Bits) - 1)) - Bits = CountTrailingZeros_32(Size); + Bits = countTrailingZeros(Size); return Bits; } diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td index b880d8373deb..a2a9f3f67455 100644 --- a/lib/Target/AArch64/AArch64CallingConv.td +++ b/lib/Target/AArch64/AArch64CallingConv.td @@ -59,9 +59,9 @@ def CC_A64_APCS : CallingConv<[ // Canonicalise the various types that live in different floating-point // registers. This makes sense because the PCS does not distinguish Short // Vectors and Floating-point types. 
- CCIfType<[v2i8], CCBitConvertToType>, - CCIfType<[v4i8, v2i16], CCBitConvertToType>, - CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType>, + CCIfType<[v1i16, v2i8], CCBitConvertToType>, + CCIfType<[v1i32, v4i8, v2i16, v1f32], CCBitConvertToType>, + CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCBitConvertToType>, @@ -70,7 +70,8 @@ def CC_A64_APCS : CallingConv<[ // argument is allocated to the least significant bits of register // v[NSRN]. The NSRN is incremented by one. The argument has now been // allocated." - CCIfType<[f16], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, + CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, + CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index daa7f1d5ef49..731823017c13 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -54,7 +54,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MachineModuleInfo &MMI = MF.getMMI(); - std::vector &Moves = MMI.getFrameMoves(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); bool NeedsFrameMoves = MMI.hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry(); @@ -97,8 +97,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(SPLabel); MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::XSP, NumInitialBytes); - Moves.push_back(MachineMove(SPLabel, Dst, Src)); + unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); + MMI.addFrameInst( + MCCFIInstruction::createDefCfa(SPLabel, Reg, -NumInitialBytes)); } // Otherwise we need to set the frame pointer and/or add a second stack @@ -131,9 +132,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL)) .addSym(FPLabel); - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::X29, -MFI->getObjectOffset(X29FrameIdx)); - Moves.push_back(MachineMove(FPLabel, Dst, Src)); + unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true); + unsigned Offset = MFI->getObjectOffset(X29FrameIdx); + MMI.addFrameInst(MCCFIInstruction::createDefCfa(FPLabel, Reg, Offset)); } FPNeedsSetting = false; @@ -164,8 +165,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(CSLabel); MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::XSP, NumResidualBytes + NumInitialBytes); - Moves.push_back(MachineMove(CSLabel, Dst, Src)); + unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); + unsigned Offset = NumResidualBytes + NumInitialBytes; + MMI.addFrameInst(MCCFIInstruction::createDefCfa(CSLabel, Reg, -Offset)); } // And any callee-saved registers (it's fine to leave them to the end here, @@ -180,10 +182,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { for (std::vector::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { - MachineLocation Dst(MachineLocation::VirtualFP, - MFI->getObjectOffset(I->getFrameIdx())); - 
MachineLocation Src(I->getReg()); - Moves.push_back(MachineMove(CSLabel, Dst, Src)); + unsigned Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true); + MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, Reg, Offset)); } } } @@ -424,7 +425,7 @@ AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const std::vector &CSI, const TargetRegisterInfo *TRI, - LoadStoreMethod PossClasses[], + const LoadStoreMethod PossClasses[], unsigned NumClasses) const { DebugLoc DL = MBB.findDebugLoc(MBBI); MachineFunction &MF = *MBB.getParent(); @@ -527,11 +528,11 @@ AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; - static LoadStoreMethod PossibleClasses[] = { + static const LoadStoreMethod PossibleClasses[] = { {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR}, {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR}, }; - unsigned NumClasses = llvm::array_lengthof(PossibleClasses); + const unsigned NumClasses = llvm::array_lengthof(PossibleClasses); emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI, PossibleClasses, NumClasses); @@ -548,11 +549,11 @@ AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; - static LoadStoreMethod PossibleClasses[] = { + static const LoadStoreMethod PossibleClasses[] = { {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR}, {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR}, }; - unsigned NumClasses = llvm::array_lengthof(PossibleClasses); + const unsigned NumClasses = llvm::array_lengthof(PossibleClasses); emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI, PossibleClasses, NumClasses); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 45ea0ec8e071..032dd90fa0e6 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -90,7 +90,7 @@ public: MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI, - LoadStoreMethod PossibleClasses[], + const LoadStoreMethod PossibleClasses[], unsigned NumClasses) const; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 102c71b5d989..ef99541c1700 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -33,7 +33,6 @@ namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { AArch64TargetMachine &TM; - const AArch64InstrInfo *TII; /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -43,7 +42,6 @@ public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), TM(tm), - TII(static_cast(TM.getInstrInfo())), Subtarget(&TM.getSubtarget()) { } @@ -72,10 +70,11 @@ public: /// Used for pre-lowered address-reference nodes, so we already know /// the fields match. This operand's job is simply to add an - /// appropriate shift operand (i.e. 0) to the MOVZ/MOVK instruction. + /// appropriate shift operand to the MOVZ/MOVK instruction. 
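// --- Illustrative sketch, not part of the patch ---
// A minimal standalone example of what the shift operand on MOVZ/MOVK means:
// a 64-bit address is materialised as four 16-bit chunks, each inserted at
// bit position 16 * shift. This is the LogShift template parameter above and
// the 3/2/1/0 shift immediates paired with MO_ABS_G3/G2_NC/G1_NC/G0_NC in the
// large-code-model constant-pool path further down. Helper names here are
// illustrative, not LLVM API.
#include <cstdint>

// movz: clear the register, then insert Chunk at bit 16 * LogShift.
static uint64_t movz(uint16_t Chunk, unsigned LogShift) {
  return uint64_t(Chunk) << (16 * LogShift);
}

// movk: keep all other bits, overwrite the 16 bits at 16 * LogShift.
static uint64_t movk(uint64_t Reg, uint16_t Chunk, unsigned LogShift) {
  uint64_t Mask = uint64_t(0xffff) << (16 * LogShift);
  return (Reg & ~Mask) | (uint64_t(Chunk) << (16 * LogShift));
}

static uint64_t materializeAddress(uint64_t Addr) {
  uint64_t R = movz(uint16_t(Addr >> 48), 3); // MOVZ, MO_ABS_G3, shift 3
  R = movk(R, uint16_t(Addr >> 32), 2);       // MOVK, MO_ABS_G2_NC, shift 2
  R = movk(R, uint16_t(Addr >> 16), 1);       // MOVK, MO_ABS_G1_NC, shift 1
  return movk(R, uint16_t(Addr), 0);          // MOVK, MO_ABS_G0_NC, shift 0
}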
+ template bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) { Imm = N; - Shift = CurDAG->getTargetConstant(0, MVT::i32); + Shift = CurDAG->getTargetConstant(LogShift, MVT::i32); return true; } @@ -102,7 +101,7 @@ public: /// Put the given constant into a pool and return a DAG which will give its /// address. - SDValue getConstantPoolItemAddress(DebugLoc DL, const Constant *CV); + SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV); SDNode *TrySelectToMoveImm(SDNode *N); SDNode *LowerToFPLitPool(SDNode *Node); @@ -110,6 +109,45 @@ public: SDNode* Select(SDNode*); private: + /// Get the opcode for table lookup instruction + unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec); + + /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4. + /// IsExt is to indicate if the result will be extended with an argument. + SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt); + + /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4. + SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcode); + + /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4. + SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes); + + /// Form sequences of consecutive 64/128-bit registers for use in NEON + /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have + /// between 1 and 4 elements. If it contains a single element that is returned + /// unchanged; otherwise a REG_SEQUENCE value is returned. + SDValue createDTuple(ArrayRef Vecs); + SDValue createQTuple(ArrayRef Vecs); + + /// Generic helper for the createDTuple/createQTuple + /// functions. Those should almost always be called instead. + SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], + unsigned SubRegs[]); + + /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4. + /// The opcode array specifies the instructions used for load. + SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes); + + /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4. + /// The opcode arrays specify the instructions used for load/store. 
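// --- Illustrative sketch, not part of the patch ---
// The opcode arrays passed to the SelectVLD/SelectVST/SelectVLDDup helpers
// declared above follow one indexing convention, visible in their switch
// statements: slots 0-3 are the 64-bit vector forms (8B, 4H, 2S, 1D) and
// slots 4-7 the 128-bit forms (16B, 8H, 4S, 2D), ordered by element size.
// The function name below is illustrative only.
#include <cassert>

static unsigned opcodeIndex(unsigned EltBits, bool Is64BitVector) {
  unsigned Idx;
  switch (EltBits) {
  case 8:  Idx = 0; break;
  case 16: Idx = 1; break;
  case 32: Idx = 2; break;
  case 64: Idx = 3; break;
  default: assert(false && "unhandled element size"); return 0;
  }
  return Is64BitVector ? Idx : Idx + 4;
}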
+ SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, + unsigned NumVecs, const uint16_t *Opcodes); + + SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, + SDValue Operand); }; } @@ -191,7 +229,7 @@ bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) { SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) { SDNode *ResNode; - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT DestType = Node->getValueType(0); unsigned DestWidth = DestType.getSizeInBits(); @@ -241,14 +279,14 @@ SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) { } SDValue -AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL, +AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL, const Constant *CV) { - EVT PtrVT = TLI.getPointerTy(); + EVT PtrVT = getTargetLowering()->getPointerTy(); - switch (TLI.getTargetMachine().getCodeModel()) { + switch (getTargetLowering()->getTargetMachine().getCodeModel()) { case CodeModel::Small: { unsigned Alignment = - TLI.getDataLayout()->getABITypeAlignment(CV->getType()); + getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); return CurDAG->getNode( AArch64ISD::WrapperSmall, DL, PtrVT, CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG), @@ -260,15 +298,15 @@ AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL, LitAddr = CurDAG->getMachineNode( AArch64::MOVZxii, DL, PtrVT, CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3), - CurDAG->getTargetConstant(0, MVT::i32)); + CurDAG->getTargetConstant(3, MVT::i32)); LitAddr = CurDAG->getMachineNode( AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC), - CurDAG->getTargetConstant(0, MVT::i32)); + CurDAG->getTargetConstant(2, MVT::i32)); LitAddr = CurDAG->getMachineNode( AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC), - CurDAG->getTargetConstant(0, MVT::i32)); + CurDAG->getTargetConstant(1, MVT::i32)); LitAddr = CurDAG->getMachineNode( AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC), @@ -281,7 +319,7 @@ AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL, } SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { - DebugLoc DL = Node->getDebugLoc(); + SDLoc DL(Node); uint64_t UnsignedVal = cast(Node)->getZExtValue(); int64_t SignedVal = cast(Node)->getSExtValue(); EVT DestType = Node->getValueType(0); @@ -312,7 +350,8 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { MemType.getSizeInBits()), UnsignedVal); SDValue PoolAddr = getConstantPoolItemAddress(DL, CV); - unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(CV->getType()); + unsigned Alignment = + getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(), PoolAddr, @@ -323,11 +362,12 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { } SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) { - DebugLoc DL = Node->getDebugLoc(); + SDLoc DL(Node); const ConstantFP *FV = cast(Node)->getConstantFPValue(); EVT DestType = Node->getValueType(0); - unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(FV->getType()); + unsigned Alignment = + getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType()); SDValue PoolAddr = getConstantPoolItemAddress(DL, FV); return 
CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr, @@ -389,12 +429,607 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8, &Ops[0], Ops.size()); } +SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef Regs) { + static unsigned RegClassIDs[] = { AArch64::DPairRegClassID, + AArch64::DTripleRegClassID, + AArch64::DQuadRegClassID }; + static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1, + AArch64::dsub_2, AArch64::dsub_3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef Regs) { + static unsigned RegClassIDs[] = { AArch64::QPairRegClassID, + AArch64::QTripleRegClassID, + AArch64::QQuadRegClassID }; + static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1, + AArch64::qsub_2, AArch64::qsub_3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, + unsigned RegClassIDs[], + unsigned SubRegs[]) { + // There's no special register-class for a vector-list of 1 element: it's just + // a vector. + if (Regs.size() == 1) + return Regs[0]; + + assert(Regs.size() >= 2 && Regs.size() <= 4); + + SDLoc DL(Regs[0].getNode()); + + SmallVector Ops; + + // First operand of REG_SEQUENCE is the desired RegClass. + Ops.push_back( + CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); + + // Then we get pairs of source & subregister-position for the components. + for (unsigned i = 0; i < Regs.size(); ++i) { + Ops.push_back(Regs[i]); + Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); + } + + SDNode *N = + CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); + return SDValue(N, 0); +} + + +// Get the register stride update opcode of a VLD/VST instruction that +// is otherwise equivalent to the given fixed stride updating instruction. 
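// --- Illustrative sketch, not part of the patch ---
// The rule the selectors below apply when a post-indexed (writeback) NEON
// load/store is seen: if the address increment is a constant it stays on the
// "_fixed" opcode (immediate post-index, the transfer size); any other
// increment is rewritten to the matching "_register" opcode via
// getVLDSTRegisterUpdateOpcode. Enum and function names here are
// illustrative, not LLVM API.
enum WritebackKind { FixedImmediate, RegisterIncrement };

static WritebackKind classifyWriteback(bool IncrementIsConstant) {
  // Mirrors: if (!isa<ConstantSDNode>(Inc.getNode()))
  //            Opc = getVLDSTRegisterUpdateOpcode(Opc);
  return IncrementIsConstant ? FixedImmediate : RegisterIncrement;
}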
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { + switch (Opc) { + default: break; + case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register; + case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register; + case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register; + case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register; + case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register; + case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register; + case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register; + case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register; + + case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register; + case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register; + case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register; + case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register; + case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register; + case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register; + case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register; + + case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register; + case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register; + case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register; + case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register; + case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register; + case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register; + case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register; + + case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register; + case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register; + case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register; + case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register; + case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register; + case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register; + case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register; + + case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register; + case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register; + case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register; + case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register; + case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register; + case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register; + case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register; + case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register; + + case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register; + case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register; + case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register; + case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register; + case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register; + case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register; + case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register; + case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register; + + case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register; + case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register; + case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register; + case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register; + case AArch64::LD1x4WB_16B_fixed: return 
AArch64::LD1x4WB_16B_register; + case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register; + case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register; + case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register; + + case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register; + case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register; + case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register; + case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register; + case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register; + case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register; + case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register; + case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register; + + case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register; + case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register; + case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register; + case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register; + case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register; + case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register; + case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register; + + case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register; + case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register; + case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register; + case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register; + case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register; + case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register; + case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register; + + case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register; + case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register; + case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register; + case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register; + case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register; + case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register; + case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register; + + case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register; + case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register; + case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register; + case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register; + case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register; + case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register; + case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register; + case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register; + + case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register; + case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register; + case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register; + case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register; + case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register; + case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register; + case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register; + case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register; + + case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register; + case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register; + case AArch64::ST1x4WB_2S_fixed: return 
AArch64::ST1x4WB_2S_register; + case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register; + case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register; + case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register; + case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register; + case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register; + + // Post-index of duplicate loads + case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register; + case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register; + case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register; + case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register; + case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register; + case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register; + case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register; + case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register; + + case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register; + case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register; + case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register; + case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register; + case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register; + case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register; + case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register; + case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register; + + case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register; + case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register; + case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register; + case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register; + case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register; + case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register; + case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register; + case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register; + + // Post-index of lane loads + case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register; + case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register; + case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register; + case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register; + + case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register; + case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register; + case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register; + case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register; + + case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register; + case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register; + case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register; + case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register; + + // Post-index of lane stores + case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register; + case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register; + case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register; + case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register; + + case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register; + case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register; + case AArch64::ST3LN_WB_S_fixed: return 
AArch64::ST3LN_WB_S_register; + case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register; + + case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register; + case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register; + case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register; + case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register; + } + return Opc; // If not one we handle, return it unchanged. +} + +SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + + EVT VT = N->getValueType(0); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; + default: llvm_unreachable("unhandled vector load type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector Ops; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + Ops.push_back(N->getOperand(0)); // Push back the Chain + + SmallVector ResTys; + // Push back the type of return super register + if (NumVecs == 1) + ResTys.push_back(VT); + else if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of the Chain + SDLoc dl(N); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(VLd)->setMemRefs(MemOp, MemOp + 1); + + if (NumVecs == 1) + return VLd; + + // If NumVecs > 1, the return result is a super register containing 2-4 + // consecutive vector registers. + SDValue SuperReg = SDValue(VLd, 0); + + unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + // Update users of the Chain + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); + + return NULL; +} + +SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + SDLoc dl(N); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; + EVT VT = N->getOperand(Vec0Idx).getValueType(); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 
3 : 7; break; + default: llvm_unreachable("unhandled vector store type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector ResTys; + if (isUpdating) + ResTys.push_back(MVT::i64); + ResTys.push_back(MVT::Other); // Type for the Chain + + SmallVector Ops; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + SmallVector Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs); + Ops.push_back(SrcReg); + + // Push back the Chain + Ops.push_back(N->getOperand(0)); + + // Transfer memoperands. + SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + cast(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; +} + +SDValue +AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, + SDValue Operand) { + SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL, + VT, VTD, MVT::Other, + CurDAG->getTargetConstant(0, MVT::i64), + Operand, + CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32)); + return SDValue(Reg, 0); +} + +SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range"); + SDLoc dl(N); + + EVT VT = N->getValueType(0); + unsigned OpcodeIndex; + bool is64BitVector = VT.is64BitVector(); + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; + case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; + case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; + case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; + default: llvm_unreachable("unhandled vector duplicate lane load type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SDValue SuperReg; + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Push back the Memory Address + if (isUpdating) { + SDValue Inc = N->getOperand(2); + if (!isa(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + Ops.push_back(N->getOperand(0)); // Push back the Chain + + SmallVector ResTys; + // Push back the type of return super register + if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of the Chain + SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(VLdDup)->setMemRefs(MemOp, MemOp + 1); + + SuperReg = SDValue(VLdDup, 0); + unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; + // Update uses of each registers in super register + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + // Update uses of the Chain + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); + return NULL; +} + +// We only have 128-bit vector type of load/store lane instructions. 
+// If it is 64-bit vector, we also select it to the 128-bit instructions. +// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and +// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output. +SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + SDLoc dl(N); + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + EVT VT64; // 64-bit Vector Type + + if (is64BitVector) { + VT64 = VT; + VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + } + + unsigned OpcodeIndex; + switch (VT.getScalarType().getSizeInBits()) { + case 8: OpcodeIndex = 0; break; + case 16: OpcodeIndex = 1; break; + case 32: OpcodeIndex = 2; break; + case 64: OpcodeIndex = 3; break; + default: llvm_unreachable("unhandled vector lane load/store type"); + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector ResTys; + if (IsLoad) { + // Push back the type of return super register + if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + } + if (isUpdating) + ResTys.push_back(MVT::i64); // Type of the updated register + ResTys.push_back(MVT::Other); // Type of Chain + SmallVector Ops; + Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + if (!isa(Inc.getNode())) // Increment in Register + Opc = getVLDSTRegisterUpdateOpcode(Opc); + Ops.push_back(Inc); + } + + SmallVector Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + if (is64BitVector) + for (unsigned i = 0; i < Regs.size(); i++) + Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]); + SDValue SuperReg = createQTuple(Regs); + + Ops.push_back(SuperReg); // Source Reg + SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32); + Ops.push_back(LaneValue); + Ops.push_back(Chain); // Push back the Chain + + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(VLdLn)->setMemRefs(MemOp, MemOp + 1); + if (!IsLoad) + return VLdLn; + + // Extract the subregisters. + SuperReg = SDValue(VLdLn, 0); + unsigned Sub0 = AArch64::qsub_0; + // Update uses of each registers in super register + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg); + if (is64BitVector) { + SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0); + } + ReplaceUses(SDValue(N, Vec), SUB0); + } + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); + return NULL; +} + +unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit, + unsigned NumOfVec) { + assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range"); + + unsigned Opc = 0; + switch (NumOfVec) { + default: + break; + case 1: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b; + else + Opc = Is64Bit ? 
AArch64::TBL1_8b : AArch64::TBL1_16b; + break; + case 2: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b; + else + Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b; + break; + case 3: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b; + else + Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b; + break; + case 4: + if (IsExt) + Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b; + else + Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b; + break; + } + + return Opc; +} + +SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs, + bool IsExt) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + SDLoc dl(N); + + // Check the element of look up table is 64-bit or not + unsigned Vec0Idx = IsExt ? 2 : 1; + assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() && + "The element of lookup table for vtbl and vtbx must be 128-bit"); + + // Check the return value type is 64-bit or not + EVT ResVT = N->getValueType(0); + bool is64BitRes = ResVT.is64BitVector(); + + // Create new SDValue for vector list + SmallVector Regs(N->op_begin() + Vec0Idx, + N->op_begin() + Vec0Idx + NumVecs); + SDValue TblReg = createQTuple(Regs); + unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs); + + SmallVector Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(TblReg); + Ops.push_back(N->getOperand(Vec0Idx + NumVecs)); + return CurDAG->getMachineNode(Opc, dl, ResVT, Ops); +} + SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n"); if (Node->isMachineOpcode()) { DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); + Node->setNodeId(-1); return NULL; } @@ -473,7 +1108,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::ATOMIC_CMP_SWAP_I64); case ISD::FrameIndex: { int FI = cast(Node)->getIndex(); - EVT PtrTy = TLI.getPointerTy(); + EVT PtrTy = getTargetLowering()->getPointerTy(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy); return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy, TFI, CurDAG->getTargetConstant(0, PtrTy)); @@ -497,7 +1132,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type"); uint16_t Register = Ty == MVT::i32 ? 
AArch64::WZR : AArch64::XZR; ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - Node->getDebugLoc(), + SDLoc(Node), Register, Ty).getNode(); } @@ -534,6 +1169,399 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { Node = ResNode; break; } + case AArch64ISD::NEON_LD1_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1WB_8B_fixed, AArch64::LD1WB_4H_fixed, + AArch64::LD1WB_2S_fixed, AArch64::LD1WB_1D_fixed, + AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed, + AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed + }; + return SelectVLD(Node, true, 1, Opcodes); + } + case AArch64ISD::NEON_LD2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2WB_8B_fixed, AArch64::LD2WB_4H_fixed, + AArch64::LD2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, + AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed, + AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed + }; + return SelectVLD(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3WB_8B_fixed, AArch64::LD3WB_4H_fixed, + AArch64::LD3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, + AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed, + AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed + }; + return SelectVLD(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4WB_8B_fixed, AArch64::LD4WB_4H_fixed, + AArch64::LD4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, + AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed, + AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed + }; + return SelectVLD(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD1x2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1x2WB_8B_fixed, AArch64::LD1x2WB_4H_fixed, + AArch64::LD1x2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, + AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed, + AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed + }; + return SelectVLD(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD1x3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1x3WB_8B_fixed, AArch64::LD1x3WB_4H_fixed, + AArch64::LD1x3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, + AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed, + AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed + }; + return SelectVLD(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD1x4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD1x4WB_8B_fixed, AArch64::LD1x4WB_4H_fixed, + AArch64::LD1x4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, + AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed, + AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed + }; + return SelectVLD(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST1_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1WB_8B_fixed, AArch64::ST1WB_4H_fixed, + AArch64::ST1WB_2S_fixed, AArch64::ST1WB_1D_fixed, + AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed, + AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed + }; + return SelectVST(Node, true, 1, Opcodes); + } + case AArch64ISD::NEON_ST2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST2WB_8B_fixed, AArch64::ST2WB_4H_fixed, + AArch64::ST2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, + AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed, + AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed + }; + return SelectVST(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST3WB_8B_fixed, AArch64::ST3WB_4H_fixed, + AArch64::ST3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, + AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed, + 
AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed + }; + return SelectVST(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST4WB_8B_fixed, AArch64::ST4WB_4H_fixed, + AArch64::ST4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, + AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed, + AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed + }; + return SelectVST(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD2DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S, + AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H, + AArch64::LD2R_4S, AArch64::LD2R_2D + }; + return SelectVLDDup(Node, false, 2, Opcodes); + } + case AArch64ISD::NEON_LD3DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S, + AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H, + AArch64::LD3R_4S, AArch64::LD3R_2D + }; + return SelectVLDDup(Node, false, 3, Opcodes); + } + case AArch64ISD::NEON_LD4DUP: { + static const uint16_t Opcodes[] = { + AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S, + AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H, + AArch64::LD4R_4S, AArch64::LD4R_2D + }; + return SelectVLDDup(Node, false, 4, Opcodes); + } + case AArch64ISD::NEON_LD2DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed, + AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed, + AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed, + AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed, + AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed, + AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed, + AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4DUP_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed, + AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed, + AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed, + AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed + }; + return SelectVLDDup(Node, true, 4, Opcodes); + } + case AArch64ISD::NEON_LD2LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed, + AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 2, Opcodes); + } + case AArch64ISD::NEON_LD3LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed, + AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 3, Opcodes); + } + case AArch64ISD::NEON_LD4LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed, + AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, true, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST2LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed, + AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST3LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed, + AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed + }; + return 
SelectVLDSTLane(Node, false, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST4LN_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed, + AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed + }; + return SelectVLDSTLane(Node, false, true, 4, Opcodes); + } + case AArch64ISD::NEON_ST1x2_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1x2WB_8B_fixed, AArch64::ST1x2WB_4H_fixed, + AArch64::ST1x2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, + AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed, + AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed + }; + return SelectVST(Node, true, 2, Opcodes); + } + case AArch64ISD::NEON_ST1x3_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1x3WB_8B_fixed, AArch64::ST1x3WB_4H_fixed, + AArch64::ST1x3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, + AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed, + AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed + }; + return SelectVST(Node, true, 3, Opcodes); + } + case AArch64ISD::NEON_ST1x4_UPD: { + static const uint16_t Opcodes[] = { + AArch64::ST1x4WB_8B_fixed, AArch64::ST1x4WB_4H_fixed, + AArch64::ST1x4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, + AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed, + AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed + }; + return SelectVST(Node, true, 4, Opcodes); + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); + bool IsExt = false; + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_vtbx1: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl1: + return SelectVTBL(Node, 1, IsExt); + case Intrinsic::aarch64_neon_vtbx2: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl2: + return SelectVTBL(Node, 2, IsExt); + case Intrinsic::aarch64_neon_vtbx3: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl3: + return SelectVTBL(Node, 3, IsExt); + case Intrinsic::aarch64_neon_vtbx4: + IsExt = true; + case Intrinsic::aarch64_neon_vtbl4: + return SelectVTBL(Node, 4, IsExt); + } + break; + } + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::arm_neon_vld1: { + static const uint16_t Opcodes[] = { + AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D, + AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D + }; + return SelectVLD(Node, false, 1, Opcodes); + } + case Intrinsic::arm_neon_vld2: { + static const uint16_t Opcodes[] = { + AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D, + AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D + }; + return SelectVLD(Node, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vld3: { + static const uint16_t Opcodes[] = { + AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D, + AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D + }; + return SelectVLD(Node, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vld4: { + static const uint16_t Opcodes[] = { + AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D, + AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D + }; + return SelectVLD(Node, false, 4, Opcodes); + } + case Intrinsic::aarch64_neon_vld1x2: { + static const uint16_t Opcodes[] = { + AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S, + AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H, + AArch64::LD1x2_4S, AArch64::LD1x2_2D + }; + return SelectVLD(Node, 
false, 2, Opcodes); + } + case Intrinsic::aarch64_neon_vld1x3: { + static const uint16_t Opcodes[] = { + AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S, + AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H, + AArch64::LD1x3_4S, AArch64::LD1x3_2D + }; + return SelectVLD(Node, false, 3, Opcodes); + } + case Intrinsic::aarch64_neon_vld1x4: { + static const uint16_t Opcodes[] = { + AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S, + AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H, + AArch64::LD1x4_4S, AArch64::LD1x4_2D + }; + return SelectVLD(Node, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vst1: { + static const uint16_t Opcodes[] = { + AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D, + AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D + }; + return SelectVST(Node, false, 1, Opcodes); + } + case Intrinsic::arm_neon_vst2: { + static const uint16_t Opcodes[] = { + AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D, + AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D + }; + return SelectVST(Node, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vst3: { + static const uint16_t Opcodes[] = { + AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D, + AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D + }; + return SelectVST(Node, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vst4: { + static const uint16_t Opcodes[] = { + AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D, + AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D + }; + return SelectVST(Node, false, 4, Opcodes); + } + case Intrinsic::aarch64_neon_vst1x2: { + static const uint16_t Opcodes[] = { + AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S, + AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H, + AArch64::ST1x2_4S, AArch64::ST1x2_2D + }; + return SelectVST(Node, false, 2, Opcodes); + } + case Intrinsic::aarch64_neon_vst1x3: { + static const uint16_t Opcodes[] = { + AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S, + AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H, + AArch64::ST1x3_4S, AArch64::ST1x3_2D + }; + return SelectVST(Node, false, 3, Opcodes); + } + case Intrinsic::aarch64_neon_vst1x4: { + static const uint16_t Opcodes[] = { + AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S, + AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H, + AArch64::ST1x4_4S, AArch64::ST1x4_2D + }; + return SelectVST(Node, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vld2lane: { + static const uint16_t Opcodes[] = { + AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D + }; + return SelectVLDSTLane(Node, true, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vld3lane: { + static const uint16_t Opcodes[] = { + AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D + }; + return SelectVLDSTLane(Node, true, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vld4lane: { + static const uint16_t Opcodes[] = { + AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D + }; + return SelectVLDSTLane(Node, true, false, 4, Opcodes); + } + case Intrinsic::arm_neon_vst2lane: { + static const uint16_t Opcodes[] = { + AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D + }; + return SelectVLDSTLane(Node, false, false, 2, Opcodes); + } + case Intrinsic::arm_neon_vst3lane: { + static const uint16_t Opcodes[] = { + AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, 
AArch64::ST3LN_D + }; + return SelectVLDSTLane(Node, false, false, 3, Opcodes); + } + case Intrinsic::arm_neon_vst4lane: { + static const uint16_t Opcodes[] = { + AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D + }; + return SelectVLDSTLane(Node, false, false, 4, Opcodes); + } + } // End of switch IntNo + break; + } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN default: break; // Let generic code handle it } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 56f675183cb4..4fdb667b9539 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -39,12 +39,10 @@ static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) { llvm_unreachable("unknown subtarget type"); } - AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)), - Subtarget(&TM.getSubtarget()), - RegInfo(TM.getRegisterInfo()), - Itins(TM.getInstrItineraryData()) { + : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) { + + const AArch64Subtarget *Subtarget = &TM.getSubtarget(); // SIMD compares set the entire lane's bits to 1 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -52,10 +50,34 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) // Scalar register <-> type mapping addRegisterClass(MVT::i32, &AArch64::GPR32RegClass); addRegisterClass(MVT::i64, &AArch64::GPR64RegClass); - addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); - addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); - addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); - addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + + if (Subtarget->hasFPARMv8()) { + addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + } + + if (Subtarget->hasNEON()) { + // And the vectors + addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass); + addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass); + addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass); + } computeRegisterProperties(); @@ -64,6 +86,12 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SHL); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); // AArch64 does not have i1 loads, or much of anything for i1 really. 
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -253,14 +281,97 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); - setExceptionPointerRegister(AArch64::X0); setExceptionSelectorRegister(AArch64::X1); + + if (Subtarget->hasNEON()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal); + + setOperationAction(ISD::SETCC, MVT::v8i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v4i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v2i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + setOperationAction(ISD::SETCC, MVT::v1i64, Custom); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v1f32, Custom); + setOperationAction(ISD::SETCC, MVT::v2f32, Custom); + setOperationAction(ISD::SETCC, MVT::v4f32, Custom); + setOperationAction(ISD::SETCC, MVT::v1f64, Custom); + setOperationAction(ISD::SETCC, MVT::v2f64, Custom); + + setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal); + 
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + + setOperationAction(ISD::FCEIL, MVT::v2f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v1f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); + + setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); + + setOperationAction(ISD::FRINT, MVT::v2f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v1f64, Legal); + setOperationAction(ISD::FRINT, MVT::v2f64, Legal); + + setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + + setOperationAction(ISD::FROUND, MVT::v2f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v1f64, Legal); + setOperationAction(ISD::FROUND, MVT::v2f64, Legal); + } } -EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { // It's reasonably important that this value matches the "natural" legal // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). @@ -271,16 +382,16 @@ EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const { static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, unsigned &LdrOpc, unsigned &StrOpc) { - static unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, - AArch64::LDXR_word, AArch64::LDXR_dword}; - static unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, - AArch64::LDAXR_word, AArch64::LDAXR_dword}; - static unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, - AArch64::STXR_word, AArch64::STXR_dword}; - static unsigned StoreRels[] = {AArch64::STLXR_byte, AArch64::STLXR_hword, - AArch64::STLXR_word, AArch64::STLXR_dword}; - - unsigned *LoadOps, *StoreOps; + static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, + AArch64::LDXR_word, AArch64::LDXR_dword}; + static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, + AArch64::LDAXR_word, AArch64::LDAXR_dword}; + static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, + AArch64::STXR_word, AArch64::STXR_dword}; + static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword, + AArch64::STLXR_word, AArch64::STLXR_dword}; + + const unsigned *LoadOps, *StoreOps; if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) LoadOps = LoadAcqs; else @@ -298,6 +409,29 @@ static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, StrOpc = StoreOps[Log2_32(Size)]; } +// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really +// have value type mapped, and they are both being defined as MVT::untyped. +// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost +// would fail to figure out the register pressure correctly. 
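// --- Illustrative sketch, not part of the patch ---
// The idea the override below encodes, stated standalone: a value that must
// live in N consecutive Q registers should be charged a pressure cost of N,
// because the tuple register classes are "untyped" and would otherwise
// default to cost 1. Class names and the return type here are illustrative.
#include <cstdint>
#include <string>
#include <utility>

static std::pair<std::string, uint8_t> representativeClass(unsigned NumQRegs) {
  switch (NumQRegs) {
  case 2:  return {"QPair", 2}; // e.g. v4i64 in the code below
  case 4:  return {"QQuad", 4}; // e.g. v8i64 in the code below
  default: return {"FPR128", 1};
  }
}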
+std::pair<const TargetRegisterClass*, uint8_t>
+AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  case MVT::v4i64:
+    RRC = &AArch64::QPairRegClass;
+    Cost = 2;
+    break;
+  case MVT::v8i64:
+    RRC = &AArch64::QQuadRegClass;
+    Cost = 4;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                         unsigned Size,
@@ -623,6 +757,12 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   MBB->addSuccessor(TrueBB);
   MBB->addSuccessor(EndBB);
 
+  if (!NZCVKilled) {
+    // NZCV is live-through TrueBB.
+    TrueBB->addLiveIn(AArch64::NZCV);
+    EndBB->addLiveIn(AArch64::NZCV);
+  }
+
   // IfTrue:
   //     str qIFTRUE, [sp]
   BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
@@ -637,8 +777,6 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   // Done:
   //     ldr qDEST, [sp]
   //     [... rest of incoming MBB ...]
-  if (!NZCVKilled)
-    EndBB->addLiveIn(AArch64::NZCV);
   MachineInstr *StartOfEnd = EndBB->begin();
   BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
     .addFrameIndex(ScratchFI)
@@ -784,7 +922,102 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
   case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall";
 
-  default: return NULL;
+  case AArch64ISD::NEON_BSL:
+    return "AArch64ISD::NEON_BSL";
+  case AArch64ISD::NEON_MOVIMM:
+    return "AArch64ISD::NEON_MOVIMM";
+  case AArch64ISD::NEON_MVNIMM:
+    return "AArch64ISD::NEON_MVNIMM";
+  case AArch64ISD::NEON_FMOVIMM:
+    return "AArch64ISD::NEON_FMOVIMM";
+  case AArch64ISD::NEON_CMP:
+    return "AArch64ISD::NEON_CMP";
+  case AArch64ISD::NEON_CMPZ:
+    return "AArch64ISD::NEON_CMPZ";
+  case AArch64ISD::NEON_TST:
+    return "AArch64ISD::NEON_TST";
+  case AArch64ISD::NEON_QSHLs:
+    return "AArch64ISD::NEON_QSHLs";
+  case AArch64ISD::NEON_QSHLu:
+    return "AArch64ISD::NEON_QSHLu";
+  case AArch64ISD::NEON_VDUP:
+    return "AArch64ISD::NEON_VDUP";
+  case AArch64ISD::NEON_VDUPLANE:
+    return "AArch64ISD::NEON_VDUPLANE";
+  case AArch64ISD::NEON_REV16:
+    return "AArch64ISD::NEON_REV16";
+  case AArch64ISD::NEON_REV32:
+    return "AArch64ISD::NEON_REV32";
+  case AArch64ISD::NEON_REV64:
+    return "AArch64ISD::NEON_REV64";
+  case AArch64ISD::NEON_UZP1:
+    return "AArch64ISD::NEON_UZP1";
+  case AArch64ISD::NEON_UZP2:
+    return "AArch64ISD::NEON_UZP2";
+  case AArch64ISD::NEON_ZIP1:
+    return "AArch64ISD::NEON_ZIP1";
+  case AArch64ISD::NEON_ZIP2:
+    return "AArch64ISD::NEON_ZIP2";
+  case AArch64ISD::NEON_TRN1:
+    return "AArch64ISD::NEON_TRN1";
+  case AArch64ISD::NEON_TRN2:
+    return "AArch64ISD::NEON_TRN2";
+  case AArch64ISD::NEON_LD1_UPD:
+    return "AArch64ISD::NEON_LD1_UPD";
+  case AArch64ISD::NEON_LD2_UPD:
+    return "AArch64ISD::NEON_LD2_UPD";
+  case AArch64ISD::NEON_LD3_UPD:
+    return "AArch64ISD::NEON_LD3_UPD";
+  case AArch64ISD::NEON_LD4_UPD:
+    return "AArch64ISD::NEON_LD4_UPD";
+  case AArch64ISD::NEON_ST1_UPD:
+    return "AArch64ISD::NEON_ST1_UPD";
+  case AArch64ISD::NEON_ST2_UPD:
+    return "AArch64ISD::NEON_ST2_UPD";
+  case AArch64ISD::NEON_ST3_UPD:
+    return "AArch64ISD::NEON_ST3_UPD";
+  case AArch64ISD::NEON_ST4_UPD:
+    return "AArch64ISD::NEON_ST4_UPD";
+  case AArch64ISD::NEON_LD1x2_UPD:
+    return "AArch64ISD::NEON_LD1x2_UPD";
+  case AArch64ISD::NEON_LD1x3_UPD:
+    return "AArch64ISD::NEON_LD1x3_UPD";
+  case AArch64ISD::NEON_LD1x4_UPD:
+    return "AArch64ISD::NEON_LD1x4_UPD";
+
case AArch64ISD::NEON_ST1x2_UPD: + return "AArch64ISD::NEON_ST1x2_UPD"; + case AArch64ISD::NEON_ST1x3_UPD: + return "AArch64ISD::NEON_ST1x3_UPD"; + case AArch64ISD::NEON_ST1x4_UPD: + return "AArch64ISD::NEON_ST1x4_UPD"; + case AArch64ISD::NEON_LD2DUP: + return "AArch64ISD::NEON_LD2DUP"; + case AArch64ISD::NEON_LD3DUP: + return "AArch64ISD::NEON_LD3DUP"; + case AArch64ISD::NEON_LD4DUP: + return "AArch64ISD::NEON_LD4DUP"; + case AArch64ISD::NEON_LD2DUP_UPD: + return "AArch64ISD::NEON_LD2DUP_UPD"; + case AArch64ISD::NEON_LD3DUP_UPD: + return "AArch64ISD::NEON_LD3DUP_UPD"; + case AArch64ISD::NEON_LD4DUP_UPD: + return "AArch64ISD::NEON_LD4DUP_UPD"; + case AArch64ISD::NEON_LD2LN_UPD: + return "AArch64ISD::NEON_LD2LN_UPD"; + case AArch64ISD::NEON_LD3LN_UPD: + return "AArch64ISD::NEON_LD3LN_UPD"; + case AArch64ISD::NEON_LD4LN_UPD: + return "AArch64ISD::NEON_LD4LN_UPD"; + case AArch64ISD::NEON_ST2LN_UPD: + return "AArch64ISD::NEON_ST2LN_UPD"; + case AArch64ISD::NEON_ST3LN_UPD: + return "AArch64ISD::NEON_ST3LN_UPD"; + case AArch64ISD::NEON_ST4LN_UPD: + return "AArch64ISD::NEON_ST4LN_UPD"; + case AArch64ISD::NEON_VEXTRACT: + return "AArch64ISD::NEON_VEXTRACT"; + default: + return NULL; } } @@ -826,7 +1059,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { void AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc DL, SDValue &Chain) const { + SDLoc DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64MachineFunctionInfo *FuncInfo @@ -858,24 +1091,31 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, } } + if (getSubtarget()->hasFPARMv8()) { unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], - &AArch64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); + // According to the AArch64 Procedure Call Standard, section B.1/B.3, we + // can omit a register save area if we know we'll never use registers of + // that class. 
+ if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], + &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } } + FuncInfo->setVariadicFPRIdx(FPRIdx); + FuncInfo->setVariadicFPRSize(FPRSaveSize); } int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true); @@ -883,8 +1123,6 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, FuncInfo->setVariadicStackIdx(StackIdx); FuncInfo->setVariadicGPRIdx(GPRIdx); FuncInfo->setVariadicGPRSize(GPRSaveSize); - FuncInfo->setVariadicFPRIdx(FPRIdx); - FuncInfo->setVariadicFPRSize(FPRSaveSize); if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], @@ -897,7 +1135,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64MachineFunctionInfo *FuncInfo @@ -1012,7 +1250,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; @@ -1085,10 +1323,10 @@ SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl &Outs = CLI.Outs; + SmallVectorImpl &OutVals = CLI.OutVals; + SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; @@ -1151,7 +1389,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP, getPointerTy()); @@ -1282,7 +1521,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // in the correct location. 
if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); InFlag = Chain.getValue(1); } @@ -1336,7 +1575,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag); + InFlag, dl); InFlag = Chain.getValue(1); } @@ -1348,7 +1587,7 @@ SDValue AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { // Assign locations to each value returned by this call. SmallVector RVLocs; @@ -1537,7 +1776,7 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, } // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other, + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, &ArgChains[0], ArgChains.size()); } @@ -1570,7 +1809,7 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &A64cc, - SelectionDAG &DAG, DebugLoc &dl) const { + SelectionDAG &DAG, SDLoc &dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { int64_t C = 0; EVT VT = RHSC->getValueType(0); @@ -1663,7 +1902,7 @@ static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); EVT PtrVT = getPointerTy(); const BlockAddress *BA = cast(Op)->getBlockAddress(); @@ -1693,7 +1932,7 @@ AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // (BRCOND chain, val, dest) SDValue AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Chain = Op.getOperand(0); SDValue TheBit = Op.getOperand(1); SDValue DestBB = Op.getOperand(2); @@ -1716,7 +1955,7 @@ AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // (BR_CC chain, condcode, lhs, rhs, dest) SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); @@ -1802,7 +2041,7 @@ AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, 0, getLibcallCallingConv(Call), isTailCall, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, Op->getDebugLoc()); + Callee, Args, DAG, SDLoc(Op)); std::pair CallInfo = LowerCallTo(CLI); if (!CallInfo.second.getNode()) @@ -1824,7 +2063,7 @@ AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, Op.getDebugLoc()); + /*isSigned*/ false, SDLoc(Op)).first; } SDValue @@ -1854,6 +2093,45 @@ AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, return LowerF128ToCall(Op, DAG, LC); } +SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = 
MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(8, MVT::i64); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Return X30, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64)); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64); +} + + +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) + const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + unsigned FrameReg = AArch64::X29; + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, false, 0); + return FrameAddr; +} + SDValue AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const { @@ -1861,7 +2139,7 @@ AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, assert(getTargetMachine().getRelocationModel() == Reloc::Static); EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); @@ -1885,7 +2163,7 @@ AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, assert(getTargetMachine().getCodeModel() == CodeModel::Small); EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); unsigned Alignment = GV->getAlignment(); @@ -1927,7 +2205,7 @@ AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, } unsigned char HiFixup, LoFixup; - bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM); + bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); if (UseGOT) { HiFixup = AArch64II::MO_GOT; @@ -1978,7 +2256,7 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, - DebugLoc DL, + SDLoc DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); @@ -2023,7 +2301,7 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && + assert(getSubtarget()->isTargetELF() && "TLS not implemented for non-ELF targets"); assert(getTargetMachine().getCodeModel() == CodeModel::Small && "TLS only supported in small memory model"); @@ -2033,7 +2311,7 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue TPOff; EVT PtrVT = getPointerTy(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); @@ -2054,7 +2332,7 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, AArch64II::MO_TPREL_G0_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, - DAG.getTargetConstant(0, MVT::i32)), 0); + DAG.getTargetConstant(1, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, TPOff, 
LoVar, DAG.getTargetConstant(0, MVT::i32)), 0); @@ -2134,7 +2412,7 @@ AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); - DebugLoc dl = JT->getDebugLoc(); + SDLoc dl(JT); EVT PtrVT = getPointerTy(); // When compiling PIC, jump tables get put in the code section so a static @@ -2161,7 +2439,7 @@ AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue IfTrue = Op.getOperand(2); @@ -2217,7 +2495,7 @@ AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // (SELECT testbit, iftrue, iffalse) SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue TheBit = Op.getOperand(0); SDValue IfTrue = Op.getOperand(1); SDValue IfFalse = Op.getOperand(2); @@ -2236,15 +2514,225 @@ AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(A64CC::NE, MVT::i32)); } +static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + EVT VT = Op.getValueType(); + bool Invert = false; + SDValue Op0, Op1; + unsigned Opcode; + + if (LHS.getValueType().isInteger()) { + + // Attempt to use Vector Integer Compare Mask Test instruction. + // TST = icmp ne (and (op0, op1), zero). + if (CC == ISD::SETNE) { + if (((LHS.getOpcode() == ISD::AND) && + ISD::isBuildVectorAllZeros(RHS.getNode())) || + ((RHS.getOpcode() == ISD::AND) && + ISD::isBuildVectorAllZeros(LHS.getNode()))) { + + SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; + SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); + SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); + return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); + } + } + + // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). + // Note: Compare against Zero does not support unsigned predicates. + if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || + ISD::isBuildVectorAllZeros(LHS.getNode())) && + !isUnsignedIntSetCC(CC)) { + + // If LHS is the zero value, swap operands and CondCode. + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + CC = getSetCCSwappedOperands(CC); + Op0 = RHS; + } else + Op0 = LHS; + + // Ensure valid CondCode for Compare Mask against Zero instruction: + // EQ, GE, GT, LE, LT. + if (ISD::SETNE == CC) { + Invert = true; + CC = ISD::SETEQ; + } + + // Using constant type to differentiate integer and FP compares with zero. + Op1 = DAG.getConstant(0, MVT::i32); + Opcode = AArch64ISD::NEON_CMPZ; + + } else { + // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). + // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. 
+ bool Swap = false; + switch (CC) { + default: + llvm_unreachable("Illegal integer comparison."); + case ISD::SETEQ: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + break; + case ISD::SETNE: + Invert = true; + CC = ISD::SETEQ; + break; + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + Swap = true; + CC = getSetCCSwappedOperands(CC); + } + + if (Swap) + std::swap(LHS, RHS); + + Opcode = AArch64ISD::NEON_CMP; + Op0 = LHS; + Op1 = RHS; + } + + // Generate Compare Mask instr or Compare Mask against Zero instr. + SDValue NeonCmp = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + + if (Invert) + NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + + return NeonCmp; + } + + // Now handle Floating Point cases. + // Attempt to use Vector Floating Point Compare Mask against Zero instruction. + if (ISD::isBuildVectorAllZeros(RHS.getNode()) || + ISD::isBuildVectorAllZeros(LHS.getNode())) { + + // If LHS is the zero value, swap operands and CondCode. + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + CC = getSetCCSwappedOperands(CC); + Op0 = RHS; + } else + Op0 = LHS; + + // Using constant type to differentiate integer and FP compares with zero. + Op1 = DAG.getConstantFP(0, MVT::f32); + Opcode = AArch64ISD::NEON_CMPZ; + } else { + // Attempt to use Vector Floating Point Compare Mask instruction. + Op0 = LHS; + Op1 = RHS; + Opcode = AArch64ISD::NEON_CMP; + } + + SDValue NeonCmpAlt; + // Some register compares have to be implemented with swapped CC and operands, + // e.g.: OLT implemented as OGT with swapped operands. + bool SwapIfRegArgs = false; + + // Ensure valid CondCode for FP Compare Mask against Zero instruction: + // EQ, GE, GT, LE, LT. + // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. + switch (CC) { + default: + llvm_unreachable("Illegal FP comparison"); + case ISD::SETUNE: + case ISD::SETNE: + Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: + CC = ISD::SETEQ; + break; + case ISD::SETOLT: + case ISD::SETLT: + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETOGT: + case ISD::SETGT: + CC = ISD::SETGT; + break; + case ISD::SETOLE: + case ISD::SETLE: + CC = ISD::SETLE; + SwapIfRegArgs = true; + break; + case ISD::SETOGE: + case ISD::SETGE: + CC = ISD::SETGE; + break; + case ISD::SETUGE: + Invert = true; + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETULE: + Invert = true; + CC = ISD::SETGT; + break; + case ISD::SETUGT: + Invert = true; + CC = ISD::SETLE; + SwapIfRegArgs = true; + break; + case ISD::SETULT: + Invert = true; + CC = ISD::SETGE; + break; + case ISD::SETUEQ: + Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OGT |OLT). + NeonCmpAlt = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OGE | OLT). 
+ NeonCmpAlt = + DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE)); + CC = ISD::SETLT; + SwapIfRegArgs = true; + break; + } + + if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) { + CC = getSetCCSwappedOperands(CC); + std::swap(Op0, Op1); + } + + // Generate FP Compare Mask instr or FP Compare Mask against Zero instr + SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + + if (NeonCmpAlt.getNode()) + NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt); + + if (Invert) + NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + + return NeonCmp; +} + // (SETCC lhs, rhs, condcode) SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorSETCC(Op, DAG); + if (LHS.getValueType() == MVT::f128) { // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS // for the rest of the function (some i32 or i64 values). @@ -2298,7 +2786,7 @@ AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes // rather than just 8. - return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(), + return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), Op.getOperand(2), DAG.getConstant(32, MVT::i32), 8, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); @@ -2311,7 +2799,7 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); AArch64MachineFunctionInfo *FuncInfo = MF.getInfo(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); @@ -2389,6 +2877,8 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); @@ -2401,16 +2891,161 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); } return SDValue(); } +/// Check if the specified splat value corresponds to a valid vector constant +/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If +/// so, return the encoded 8-bit immediate and the OpCmode instruction fields +/// values. +static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + bool is128Bits, NeonModImmType type, EVT &VT, + unsigned &Imm, unsigned &OpCmode) { + switch (SplatBitSize) { + default: + llvm_unreachable("unexpected size for isNeonModifiedImm"); + case 8: { + if (type != Neon_Mov_Imm) + return false; + assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); + // Neon movi per byte: Op=0, Cmode=1110. 
+ OpCmode = 0xe; + Imm = SplatBits; + VT = is128Bits ? MVT::v16i8 : MVT::v8i8; + break; + } + case 16: { + // Neon move inst per halfword + VT = is128Bits ? MVT::v8i16 : MVT::v4i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn is 0x00nn LSL 0 + // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 + // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 + // Op=x, Cmode=100y + Imm = SplatBits; + OpCmode = 0x8; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00 is 0x00nn LSL 8 + // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 + // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 + // Op=x, Cmode=101x + Imm = SplatBits >> 8; + OpCmode = 0xa; + break; + } + // can't handle any other + return false; + } + + case 32: { + // First the LSL variants (MSL is unusable by some interested instructions). + + // Neon move instr per word, shift zeros + VT = is128Bits ? MVT::v4i32 : MVT::v2i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn is 0x000000nn LSL 0 + // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 + // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 + // Op=x, Cmode=000x + Imm = SplatBits; + OpCmode = 0; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00 is 0x000000nn LSL 8 + // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 + // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 + // Op=x, Cmode=001x + Imm = SplatBits >> 8; + OpCmode = 0x2; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000 is 0x000000nn LSL 16 + // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 + // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 + // Op=x, Cmode=010x + Imm = SplatBits >> 16; + OpCmode = 0x4; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000 is 0x000000nn LSL 24 + // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 + // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 + // Op=x, Cmode=011x + Imm = SplatBits >> 24; + OpCmode = 0x6; + break; + } + + // Now the MSL immediates. + + // Neon move instr per word, shift ones + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff is 0x000000nn MSL 8 + // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 + // Op=x, Cmode=1100 + Imm = SplatBits >> 8; + OpCmode = 0xc; + break; + } + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff is 0x000000nn MSL 16 + // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 + // Op=x, Cmode=1101 + Imm = SplatBits >> 16; + OpCmode = 0xd; + break; + } + // can't handle any other + return false; + } + + case 64: { + if (type != Neon_Mov_Imm) + return false; + // Neon move instr bytemask, where each byte is either 0x00 or 0xff. + // movi Op=1, Cmode=1110. + OpCmode = 0x1e; + uint64_t BitMask = 0xff; + uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { + return false; + } + BitMask <<= 8; + ImmMask <<= 1; + } + SplatBits = Val; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; + break; + } + } + + return true; +} + static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); // We're looking for an SRA/SHL pair which form an SBFX. 
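// A minimal standalone sketch (illustrative only, not part of the imported
// diff) of the 32-bit "LSL #0/8/16/24" cases that isNeonModifiedImm handles
// above: a splat whose only non-zero byte sits at byte position k encodes as
// Imm = that byte and OpCmode = 2*k, matching the tables in the comments.
// The MSL ("shift ones"), per-halfword and per-byte forms are not covered
// here; the helper name is ours.
#include <cstdint>

static bool encodeMovi32LSL(uint32_t Splat, unsigned &Imm, unsigned &OpCmode) {
  for (unsigned Byte = 0; Byte < 4; ++Byte) {
    uint32_t Mask = 0xffu << (8 * Byte);
    if ((Splat & ~Mask) == 0) {
      Imm = (Splat >> (8 * Byte)) & 0xff;   // the single significant byte
      OpCmode = 2 * Byte;                   // 0x0, 0x2, 0x4 or 0x6
      return true;
    }
  }
  return false;  // needs an MSL form, MVNI, or a constant-pool load
}
// e.g. encodeMovi32LSL(0x00005600, Imm, OpCmode) yields Imm = 0x56,
// OpCmode = 0x2, as in the "Value = 0x0000nn00" case above.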
@@ -2448,7 +3083,7 @@ static SDValue PerformANDCombine(SDNode *N, /// a compatible SHL operation (unless they're already low). This function /// checks that condition and returns the least-significant bit that's /// intended. If the operation not a field preparation, -1 is returned. -static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT, +static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue &MaskedVal, uint64_t Mask) { if (!isShiftedMask_64(Mask)) return -1; @@ -2464,7 +3099,7 @@ static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT, // cases (e.g. bitfield to bitfield copy) may still need a real shift before // the BFI. - uint64_t LSB = CountTrailingZeros_64(Mask); + uint64_t LSB = countTrailingZeros(Mask); int64_t ShiftRightRequired = LSB; if (MaskedVal.getOpcode() == ISD::SHL && isa(MaskedVal.getOperand(1))) { @@ -2524,7 +3159,7 @@ static SDValue tryCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); @@ -2605,7 +3240,7 @@ static SDValue tryCombineToLargerBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); // First job is to hunt for a MaskedBFI on either the left or right. Swap @@ -2687,7 +3322,7 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); @@ -2731,6 +3366,7 @@ static SDValue PerformORCombine(SDNode *N, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); EVT VT = N->getValueType(0); if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -2751,6 +3387,44 @@ static SDValue PerformORCombine(SDNode *N, if (Res.getNode()) return Res; + if (!Subtarget->hasNEON()) + return SDValue(); + + // Attempt to use vector immediate-form BSL + // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + APInt SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(1)); + APInt SplatBits0; + if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, + HasAnyUndefs) && + !HasAnyUndefs) { + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); + APInt SplatBits1; + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && + !HasAnyUndefs && SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection simpler. + EVT CanonicalVT = VT.is128BitVector() ? 
MVT::v16i8 : MVT::v8i8; + SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT, + N0->getOperand(1), N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, Result); + } + } + } + return SDValue(); } @@ -2759,7 +3433,7 @@ static SDValue PerformSRACombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); // We're looking for an SRA/SHL pair which form an SBFX. @@ -2791,6 +3465,336 @@ static SDValue PerformSRACombine(SDNode *N, DAG.getConstant(LSB + Width - 1, MVT::i64)); } +/// Check if this is a valid build_vector for the immediate operand of +/// a vector shift operation, where all the elements of the build_vector +/// must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// Check if this is a valid build_vector for the immediate operand of +/// a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits +static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && Cnt < ElementBits); +} + +/// Check if this is a valid build_vector for the immediate operand of a +/// vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits +static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 1 && Cnt <= ElementBits); +} + +/// Checks for immediate versions of vector shifts and lowers them. +static SDValue PerformShiftCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *ST) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64)) + return PerformSRACombine(N, DCI); + + // Nothing to be done for scalar shifts. 
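// Standalone sketch (illustrative only, helper names are ours) of the
// immediate ranges accepted by isVShiftLImm / isVShiftRImm above: left
// shifts encode 0..ElementBits-1, right shifts encode 1..ElementBits,
// mirroring the NEON shift-by-immediate instruction encodings.
#include <cstdint>

static bool validVShiftLImm(int64_t Cnt, unsigned ElementBits) {
  return Cnt >= 0 && Cnt < (int64_t)ElementBits;   // e.g. v4i32: 0..31
}

static bool validVShiftRImm(int64_t Cnt, unsigned ElementBits) {
  return Cnt >= 1 && Cnt <= (int64_t)ElementBits;  // e.g. v4i32: 1..32
}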
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isVector() || !TLI.isTypeLegal(VT)) + return SDValue(); + + assert(ST->hasNEON() && "unexpected vector shift"); + int64_t Cnt; + + switch (N->getOpcode()) { + default: + llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(N->getOperand(1), VT, Cnt)) { + SDValue RHS = + DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS); + } + break; + + case ISD::SRA: + case ISD::SRL: + if (isVShiftRImm(N->getOperand(1), VT, Cnt)) { + SDValue RHS = + DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS); + } + break; + } + + return SDValue(); +} + +/// ARM-specific DAG combining for intrinsics. +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + + switch (IntNo) { + default: + // Don't do anything for most intrinsics. + break; + + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + if (!isVShiftLImm(N->getOperand(2), VT, Cnt)) + break; + unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts) + ? AArch64ISD::NEON_QSHLs + : AArch64ISD::NEON_QSHLu; + return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + } + + return SDValue(); +} + +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. +static SDValue CombineBaseUpdate(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. 
+ bool isLoad = true; + bool isLaneOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + if (isIntrinsic) { + unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; + NumVecs = 1; isLoad = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD; + NumVecs = 2; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD; + NumVecs = 3; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD; + NumVecs = 4; isLoad = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode for Neon base update"); + case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD; + NumVecs = 2; break; + case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD; + NumVecs = 3; break; + case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD; + NumVecs = 4; break; + } + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoad) + VecTy = N->getValueType(0); + else + VecTy = N->getOperand(AddrOpIdx + 1).getValueType(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getTargetConstant(IncVal, MVT::i32); + } + + // Create the new updating load/store node. 
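// Illustrative sketch (not from the imported diff; helper name and
// parameters are ours) of the size check used above when folding a
// post-increment into the load/store: the constant added to the base must
// equal the number of bytes the operation transfers.
static unsigned baseUpdateBytes(unsigned NumVecs, unsigned VecBits,
                                unsigned NumLanes, bool IsLaneOp) {
  unsigned Bytes = NumVecs * VecBits / 8;  // whole-vector loads/stores
  if (IsLaneOp)
    Bytes /= NumLanes;                     // lane ops move one lane per vector
  return Bytes;
}
// e.g. a vld2 of two v4i32 vectors: baseUpdateBytes(2, 128, 4, false) == 32,
// so only an add of #32 to the base pointer can be merged into the
// post-indexed NEON_LD2_UPD form.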
+ EVT Tys[6]; + unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2); + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(AddrOpIdx)); + Ops.push_back(Inc); + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + MemIntrinsicSDNode *MemInt = cast(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, + Ops.data(), Ops.size(), + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) +/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs. +/// If so, combine them to a vldN-dup operation and return true. +static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. + SDNode *VLD = N->getOperand(0).getNode(); + if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return SDValue(); + unsigned NumVecs = 0; + unsigned NewOpc = 0; + unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_neon_vld2lane) { + NumVecs = 2; + NewOpc = AArch64ISD::NEON_LD2DUP; + } else if (IntNo == Intrinsic::arm_neon_vld3lane) { + NumVecs = 3; + NewOpc = AArch64ISD::NEON_LD3DUP; + } else if (IntNo == Intrinsic::arm_neon_vld4lane) { + NumVecs = 4; + NewOpc = AArch64ISD::NEON_LD4DUP; + } else { + return SDValue(); + } + + // First check that all the vldN-lane uses are VDUPLANEs and that the lane + // numbers match the load. + unsigned VLDLaneNo = + cast(VLD->getOperand(NumVecs + 3))->getZExtValue(); + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + // Ignore uses of the chain result. + if (UI.getUse().getResNo() == NumVecs) + continue; + SDNode *User = *UI; + if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE || + VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) + return SDValue(); + } + + // Create the vldN-dup node. + EVT Tys[5]; + unsigned n; + for (n = 0; n < NumVecs; ++n) + Tys[n] = VT; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1); + SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; + MemIntrinsicSDNode *VLDMemInt = cast(VLD); + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2, + VLDMemInt->getMemoryVT(), + VLDMemInt->getMemOperand()); + + // Update the uses. + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + unsigned ResNo = UI.getUse().getResNo(); + // Ignore uses of the chain result. + if (ResNo == NumVecs) + continue; + SDNode *User = *UI; + DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + } + + // Now the vldN-lane intrinsic is dead except for its chain result. + // Update uses of the chain. 
+ std::vector VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return SDValue(N, 0); +} SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, @@ -2798,12 +3802,578 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::AND: return PerformANDCombine(N, DCI); - case ISD::OR: return PerformORCombine(N, DCI, Subtarget); - case ISD::SRA: return PerformSRACombine(N, DCI); + case ISD::OR: return PerformORCombine(N, DCI, getSubtarget()); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + return PerformShiftCombine(N, DCI, getSubtarget()); + case ISD::INTRINSIC_WO_CHAIN: + return PerformIntrinsicCombine(N, DCI.DAG); + case AArch64ISD::NEON_VDUPLANE: + return CombineVLDDUP(N, DCI); + case AArch64ISD::NEON_LD2DUP: + case AArch64ISD::NEON_LD3DUP: + case AArch64ISD::NEON_LD4DUP: + return CombineBaseUpdate(N, DCI); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast(N->getOperand(1))->getZExtValue()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::aarch64_neon_vld1x2: + case Intrinsic::aarch64_neon_vld1x3: + case Intrinsic::aarch64_neon_vld1x4: + case Intrinsic::aarch64_neon_vst1x2: + case Intrinsic::aarch64_neon_vst1x3: + case Intrinsic::aarch64_neon_vst1x4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: + return CombineBaseUpdate(N, DCI); + default: + break; + } } return SDValue(); } +bool +AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f16: + case MVT::f32: + case MVT::f64: + return true; + case MVT::f128: + return false; + default: + break; + } + + return false; +} + +// Check whether a Build Vector could be presented as Shuffle Vector. If yes, +// try to call LowerVECTOR_SHUFFLE to lower it. +bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, + SDValue &Res) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned V0NumElts = 0; + int Mask[16]; + SDValue V0, V1; + + // Check if all elements are extracted from less than 3 vectors. 
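// Small sketch (illustrative only; the helper name and its parameters are
// ours) of what the isFMAFasterThanFMulAndFAdd override above reports: for
// f16, f32 and f64, whether scalar or as vector elements, a fused
// multiply-add is treated as no slower than separate fmul and fadd, so the
// generic combining code is allowed to form ISD::FMA; f128 is excluded
// because it is lowered through library calls.
static bool fmaProfitable(unsigned ScalarSizeInBits, bool IsFP128) {
  if (IsFP128)
    return false;  // f128 has no hardware fused multiply-add here
  return ScalarSizeInBits == 16 || ScalarSizeInBits == 32 ||
         ScalarSizeInBits == 64;
}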
+ for (unsigned i = 0; i < NumElts; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + if (V0.getNode() == 0) { + V0 = Elt.getOperand(0); + V0NumElts = V0.getValueType().getVectorNumElements(); + } + if (Elt.getOperand(0) == V0) { + Mask[i] = (cast(Elt->getOperand(1))->getZExtValue()); + continue; + } else if (V1.getNode() == 0) { + V1 = Elt.getOperand(0); + } + if (Elt.getOperand(0) == V1) { + unsigned Lane = cast(Elt->getOperand(1))->getZExtValue(); + Mask[i] = (Lane + V0NumElts); + continue; + } else { + return false; + } + } + + if (!V1.getNode() && V0NumElts == NumElts * 2) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, + DAG.getConstant(NumElts, MVT::i64)); + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, + DAG.getConstant(0, MVT::i64)); + V0NumElts = V0.getValueType().getVectorNumElements(); + } + + if (V1.getNode() && NumElts == V0NumElts && + V0NumElts == V1.getValueType().getVectorNumElements()) { + SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask); + Res = LowerVECTOR_SHUFFLE(Shuffle, DAG); + return true; + } else + return false; +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. +SDValue +AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) const { + + BuildVectorSDNode *BVN = cast(Op.getNode()); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + unsigned UseNeonMov = VT.getSizeInBits() >= 64; + + // Note we favor lowering MOVI over MVNI. + // This has implications on the definition of patterns in TableGen to select + // BIC immediate instructions but not ORR immediate instructions. + // If this lowering order is changed, TableGen patterns for BIC immediate and + // ORR immediate instructions have to be updated. + if (UseNeonMov && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + // First attempt to use vector immediate-form MOVI + EVT NeonMovVT; + unsigned Imm = 0; + unsigned OpCmode = 0; + + if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, VT.is128BitVector(), + Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) { + SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); + SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); + + if (ImmVal.getNode() && OpCmodeVal.getNode()) { + SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT, + ImmVal, OpCmodeVal); + return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + } + } + + // Then attempt to use vector immediate-form MVNI + uint64_t NegatedImm = (~SplatBits).getZExtValue(); + if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT, + Imm, OpCmode)) { + SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); + SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); + if (ImmVal.getNode() && OpCmodeVal.getNode()) { + SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT, + ImmVal, OpCmodeVal); + return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + } + } + + // Attempt to use vector immediate-form FMOV + if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) || + (VT == MVT::v2f64 && SplatBitSize == 64)) { + APFloat RealVal( + SplatBitSize == 32 ? 
APFloat::IEEEsingle : APFloat::IEEEdouble, + SplatBits); + uint32_t ImmVal; + if (A64Imms::isFPImm(RealVal, ImmVal)) { + SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); + return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val); + } + } + } + } + + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool hasDominantValue = false; + bool isConstant = true; + + // Map of the number of times a particular SDValue appears in the + // element list. + DenseMap ValueCounts; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa(V) && !isa(V)) + isConstant = false; + + ValueCounts.insert(std::make_pair(V, 0)); + unsigned &Count = ValueCounts[V]; + + // Is this value dominant? (takes up more than half of the lanes) + if (++Count > (NumElts / 2)) { + hasDominantValue = true; + Value = V; + } + } + if (ValueCounts.size() != 1) + usesOnlyOneValue = false; + if (!Value.getNode() && ValueCounts.size() > 0) + Value = ValueCounts.begin()->first; + + if (ValueCounts.size() == 0) + return DAG.getUNDEF(VT); + + // Loads are better lowered with insert_vector_elt. + // Keep going if we are hitting this case. + if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (hasDominantValue && EltSize <= 64) { + // Use VDUP for non-constant splats. + if (!isConstant) { + SDValue N; + + // If we are DUPing a value that comes directly from a vector, we could + // just use DUPLANE. We can only do this if the lane being extracted + // is at a constant index, as the DUP from lane instructions only have + // constant-index forms. + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa(Value->getOperand(1))) { + N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, + Value->getOperand(0), Value->getOperand(1)); + } else + N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); + + if (!usesOnlyOneValue) { + // The dominant value was splatted as 'N', but we now have to insert + // all differing elements. + for (unsigned I = 0; I < NumElts; ++I) { + if (Op.getOperand(I) == Value) + continue; + SmallVector Ops; + Ops.push_back(N); + Ops.push_back(Op.getOperand(I)); + Ops.push_back(DAG.getConstant(I, MVT::i64)); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3); + } + } + return N; + } + if (usesOnlyOneValue && isConstant) { + return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); + } + } + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Try to lower this in lowering ShuffleVector way. + SDValue Shuf; + if (isKnownShuffleVector(Op, DAG, Shuf)) + return Shuf; + + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. 
+ if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx); + } + return Vec; + } + return SDValue(); +} + +/// isREVMask - Check if a vector shuffle corresponds to a REV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and +// TRN instruction. +static unsigned isPermuteMask(ArrayRef M, EVT VT) { + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts < 4) + return 0; + + bool ismatch = true; + + // Check UZP1 + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i * 2) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_UZP1; + + // Check UZP2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i * 2 + 1) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_UZP2; + + // Check ZIP1 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_ZIP1; + + // Check ZIP2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_ZIP2; + + // Check TRN1 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_TRN1; + + // Check TRN2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_TRN2; + + return 0; +} + +SDValue +AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. 
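// Worked examples (illustrative only) of the masks recognised by
// isPermuteMask above for a v8i8 shuffle of sources V1 (lanes 0-7) and
// V2 (lanes 8-15); each array follows directly from the index formula used
// in the corresponding check.
static const int UZP1Mask8[8] = { 0, 2, 4, 6, 8, 10, 12, 14 };
static const int UZP2Mask8[8] = { 1, 3, 5, 7, 9, 11, 13, 15 };
static const int ZIP1Mask8[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
static const int ZIP2Mask8[8] = { 4, 12, 5, 13, 6, 14, 7, 15 };
static const int TRN1Mask8[8] = { 0, 8, 2, 10, 4, 12, 6, 14 };
static const int TRN2Mask8[8] = { 1, 9, 3, 11, 5, 13, 7, 15 };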
+ ArrayRef ShuffleMask = SVN->getMask(); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize > 64) + return SDValue(); + + if (isREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1); + if (isREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1); + if (isREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1); + + unsigned ISDNo = isPermuteMask(ShuffleMask, VT); + if (ISDNo) + return DAG.getNode(ISDNo, dl, VT, V1, V2); + + // If the element of shuffle mask are all the same constant, we can + // transform it into either NEON_VDUP or NEON_VDUPLANE + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) Lane = 0; + + // Test if V1 is a SCALAR_TO_VECTOR. + if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0)); + } + // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR. + if (V1.getOpcode() == ISD::BUILD_VECTOR) { + bool IsScalarToVector = true; + for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i) + if (V1.getOperand(i).getOpcode() != ISD::UNDEF && + i != (unsigned)Lane) { + IsScalarToVector = false; + break; + } + if (IsScalarToVector) + return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, + V1.getOperand(Lane)); + } + + // Test if V1 is a EXTRACT_SUBVECTOR. + if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + int ExtLane = cast(V1.getOperand(1))->getZExtValue(); + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0), + DAG.getConstant(Lane + ExtLane, MVT::i64)); + } + // Test if V1 is a CONCAT_VECTORS. + if (V1.getOpcode() == ISD::CONCAT_VECTORS && + V1.getOperand(1).getOpcode() == ISD::UNDEF) { + SDValue Op0 = V1.getOperand(0); + assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() && + "Invalid vector lane access"); + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0, + DAG.getConstant(Lane, MVT::i64)); + } + + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i64)); + } + + int Length = ShuffleMask.size(); + int V1EltNum = V1.getValueType().getVectorNumElements(); + + // If the number of v1 elements is the same as the number of shuffle mask + // element and the shuffle masks are sequential values, we can transform + // it into NEON_VEXTRACT. + if (V1EltNum == Length) { + // Check if the shuffle mask is sequential. + bool IsSequential = true; + int CurMask = ShuffleMask[0]; + for (int I = 0; I < Length; ++I) { + if (ShuffleMask[I] != CurMask) { + IsSequential = false; + break; + } + CurMask++; + } + if (IsSequential) { + assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect"); + unsigned VecSize = EltSize * V1EltNum; + unsigned Index = (EltSize/8) * ShuffleMask[0]; + if (VecSize == 64 || VecSize == 128) + return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2, + DAG.getConstant(Index, MVT::i64)); + } + } + + // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert + // by element from V2 to V1 . + // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a + // better choice to be inserted than V1 as less insert needed, so we count + // element to be inserted for both V1 and V2, and select less one as insert + // target. + + // Collect elements need to be inserted and their index. 
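When the whole mask is one consecutive run, the shuffle above is emitted as a single NEON_VEXTRACT (EXT) whose immediate is a byte offset: element size in bytes times the first mask index. A standalone sketch of that immediate computation, under the same preconditions the code checks (consecutive mask, 64- or 128-bit total vector size):

#include <cstdio>

// Returns the EXT byte immediate for a consecutive shuffle mask, or -1 if the
// preconditions used by the lowering above do not hold.
static int extByteImm(const int *Mask, unsigned Len, unsigned EltBits) {
  for (unsigned i = 1; i < Len; ++i)
    if (Mask[i] != Mask[0] + (int)i)   // mask must be strictly sequential
      return -1;
  unsigned VecBits = EltBits * Len;
  if (VecBits != 64 && VecBits != 128) // only D and Q registers are handled
    return -1;
  return (EltBits / 8) * Mask[0];
}

int main() {
  // v8i16 shuffle <2,3,4,5,6,7,8,9> of two 8 x 16-bit vectors: the run starts
  // at element 2, so the EXT immediate is 2 * 2 bytes = 4.
  int Mask[8] = {2, 3, 4, 5, 6, 7, 8, 9};
  std::printf("EXT #%d\n", extByteImm(Mask, 8, 16));
}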
+ SmallVector NV1Elt; + SmallVector N1Index; + SmallVector NV2Elt; + SmallVector N2Index; + for (int I = 0; I != Length; ++I) { + if (ShuffleMask[I] != I) { + NV1Elt.push_back(ShuffleMask[I]); + N1Index.push_back(I); + } + } + for (int I = 0; I != Length; ++I) { + if (ShuffleMask[I] != (I + V1EltNum)) { + NV2Elt.push_back(ShuffleMask[I]); + N2Index.push_back(I); + } + } + + // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2 + // will be inserted. + SDValue InsV = V1; + SmallVector InsMasks = NV1Elt; + SmallVector InsIndex = N1Index; + if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) { + if (NV1Elt.size() > NV2Elt.size()) { + InsV = V2; + InsMasks = NV2Elt; + InsIndex = N2Index; + } + } else { + InsV = DAG.getNode(ISD::UNDEF, dl, VT); + } + + for (int I = 0, E = InsMasks.size(); I != E; ++I) { + SDValue ExtV = V1; + int Mask = InsMasks[I]; + if (Mask >= V1EltNum) { + ExtV = V2; + Mask -= V1EltNum; + } + // Any value type smaller than i32 is illegal in AArch64, and this lower + // function is called after legalize pass, so we need to legalize + // the result here. + EVT EltVT; + if (VT.getVectorElementType().isFloatingPoint()) + EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32; + else + EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32; + + if (Mask >= 0) { + ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, + DAG.getConstant(Mask, MVT::i64)); + InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, + DAG.getConstant(InsIndex[I], MVT::i64)); + } + } + return InsV; +} + AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { @@ -2899,7 +4469,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'S': { // An absolute symbolic address or label reference. if (const GlobalAddressSDNode *GA = dyn_cast(Op)) { - Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), + Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), GA->getValueType(0)); } else if (const BlockAddressSDNode *BA = dyn_cast(Op)) { @@ -2935,7 +4505,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( const std::string &Constraint, - EVT VT) const { + MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -2949,14 +4519,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( return std::make_pair(0U, &AArch64::FPR16RegClass); else if (VT == MVT::f32) return std::make_pair(0U, &AArch64::FPR32RegClass); - else if (VT == MVT::f64) - return std::make_pair(0U, &AArch64::FPR64RegClass); else if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &AArch64::VPR64RegClass); - else if (VT == MVT::f128) - return std::make_pair(0U, &AArch64::FPR128RegClass); + return std::make_pair(0U, &AArch64::FPR64RegClass); else if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &AArch64::VPR128RegClass); + return std::make_pair(0U, &AArch64::FPR128RegClass); break; } } @@ -2965,3 +4531,69 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( // constraint into a member of a register class. return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } + +/// Represent NEON load and store intrinsics as MemIntrinsicNodes. +/// The associated MachineMemOperands record the alignment specified +/// in the intrinsic calls. 
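The insert-by-element fallback just above picks as its base whichever of V1 and V2 already has more lanes in the right place, i.e. fewer mismatching mask entries, and falls back to an UNDEF base when every lane mismatches for both. A minimal sketch of that decision in plain C++ (0 = keep V1, 1 = keep V2, -1 = start from UNDEF):

#include <cstdio>
#include <vector>

// Decide the insertion base the same way the lowering above does: lane i is
// "in place" for V1 when Mask[i] == i, and for V2 when Mask[i] == i + NumElts
// (V2's lanes follow V1's in shuffle-mask numbering).
static int chooseInsertBase(const std::vector<int> &Mask, int NumElts) {
  int Len = (int)Mask.size();
  int MissV1 = 0, MissV2 = 0;
  for (int i = 0; i != Len; ++i) {
    if (Mask[i] != i)           ++MissV1;
    if (Mask[i] != i + NumElts) ++MissV2;
  }
  if (MissV1 == Len && MissV2 == Len)
    return -1;                      // nothing is in place: build onto UNDEF
  return MissV1 <= MissV2 ? 0 : 1;  // keep the base needing fewer inserts
}

int main() {
  // <0,1,10,11,12,13,14,15> with 8-lane inputs: V2 needs only 2 inserts,
  // so it is the better base, as the comment above describes.
  std::vector<int> Mask = {0, 1, 10, 11, 12, 13, 14, 15};
  std::printf("base=%d\n", chooseInsertBase(Mask, 8));
}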
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::aarch64_neon_vld1x2: + case Intrinsic::aarch64_neon_vld1x3: + case Intrinsic::aarch64_neon_vld1x4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::aarch64_neon_vst1x2: + case Intrinsic::aarch64_neon_vst1x3: + case Intrinsic::aarch64_neon_vst1x4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index d49b3ee453f9..8ad5a79a33ee 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -19,7 +19,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" - +#include "llvm/IR/Intrinsics.h" namespace llvm { namespace AArch64ISD { @@ -111,7 +111,92 @@ namespace AArch64ISD { // created using the small memory model style: i.e. adrp/add or // adrp/mem-op. This exists to prevent bare TargetAddresses which may never // get selected. 
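A worked example of the conservative memVT arithmetic in getTgtMemIntrinsic above, hedged as an illustration only (the concrete intrinsic and result types are assumed, not taken from this patch): a three-register load returning three 4 x i32 vectors touches 48 bytes, which the code expresses as v6i64.

#include <cstdio>

int main() {
  // Assumed example: a 3-register NEON load whose result is three 4 x i32
  // vectors. Its allocation size is 3 * 16 = 48 bytes, so the conservative
  // memVT computed above (allocation size / 8, in lanes of i64) is v6i64.
  unsigned BytesPerVec = 16;               // v4i32: 4 lanes * 4 bytes
  unsigned TotalBytes  = 3 * BytesPerVec;  // three result vectors
  std::printf("memVT = v%ui64 (%u bytes)\n", TotalBytes / 8, TotalBytes);
}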
- WrapperSmall + WrapperSmall, + + // Vector bitwise select + NEON_BSL, + + // Vector move immediate + NEON_MOVIMM, + + // Vector Move Inverted Immediate + NEON_MVNIMM, + + // Vector FP move immediate + NEON_FMOVIMM, + + // Vector permute + NEON_UZP1, + NEON_UZP2, + NEON_ZIP1, + NEON_ZIP2, + NEON_TRN1, + NEON_TRN2, + + // Vector Element reverse + NEON_REV64, + NEON_REV32, + NEON_REV16, + + // Vector compare + NEON_CMP, + + // Vector compare zero + NEON_CMPZ, + + // Vector compare bitwise test + NEON_TST, + + // Vector saturating shift + NEON_QSHLs, + NEON_QSHLu, + + // Vector dup + NEON_VDUP, + + // Vector dup by lane + NEON_VDUPLANE, + + // Vector extract + NEON_VEXTRACT, + + // NEON duplicate lane loads + NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + NEON_LD3DUP, + NEON_LD4DUP, + + // NEON loads with post-increment base updates: + NEON_LD1_UPD, + NEON_LD2_UPD, + NEON_LD3_UPD, + NEON_LD4_UPD, + NEON_LD1x2_UPD, + NEON_LD1x3_UPD, + NEON_LD1x4_UPD, + + // NEON stores with post-increment base updates: + NEON_ST1_UPD, + NEON_ST2_UPD, + NEON_ST3_UPD, + NEON_ST4_UPD, + NEON_ST1x2_UPD, + NEON_ST1x3_UPD, + NEON_ST1x4_UPD, + + // NEON duplicate lane loads with post-increment base updates: + NEON_LD2DUP_UPD, + NEON_LD3DUP_UPD, + NEON_LD4DUP_UPD, + + // NEON lane loads with post-increment base updates: + NEON_LD2LN_UPD, + NEON_LD3LN_UPD, + NEON_LD4LN_UPD, + + // NEON lane store with post-increment base updates: + NEON_ST2LN_UPD, + NEON_ST3LN_UPD, + NEON_ST4LN_UPD }; } @@ -130,14 +215,14 @@ public: SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - DebugLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const; @@ -145,12 +230,18 @@ public: SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; - void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc DL, SDValue &Chain) const; + bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const; + + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) const; + + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. 
Targets which want to do tail call @@ -171,7 +262,7 @@ public: SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, int ClobberedFI) const; - EVT getSetCCResultType(EVT VT) const; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; @@ -181,7 +272,7 @@ public: bool isLegalICmpImmediate(int64_t Val) const; SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &A64cc, SelectionDAG &DAG, DebugLoc &dl) const; + SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const; virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; @@ -211,12 +302,14 @@ public: SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, DebugLoc DL, + SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; @@ -229,11 +322,11 @@ public: virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than - /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to - /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd - /// is expanded to mul + add. - virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. 
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; ConstraintType getConstraintType(const std::string &Constraint) const; @@ -245,12 +338,30 @@ public: SelectionDAG &DAG) const; std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; + getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const LLVM_OVERRIDE; + +protected: + std::pair + findRepresentativeClass(MVT VT) const; + private: - const AArch64Subtarget *Subtarget; - const TargetRegisterInfo *RegInfo; const InstrItineraryData *Itins; + + const AArch64Subtarget *getSubtarget() const { + return &getTargetMachine().getSubtarget(); + } }; +enum NeonModImmType { + Neon_Mov_Imm, + Neon_Mvn_Imm +}; + +extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement, + bool &usesOnlyOneValue, bool &hasDominantValue, + bool &isConstant, bool &isUNDEF); } // namespace llvm #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 9dd122f14941..34f917caabe7 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -120,6 +120,14 @@ class A64InstRdnm patterns, InstrItinClass itin> + : A64InstRtn { + bits<5> Rm; + + let Inst{20-16} = Rm; +} + //===----------------------------------------------------------------------===// // // Actual A64 Instruction Formats @@ -383,6 +391,8 @@ class A64I_extract op, bit n, // Inherits Rd in 4-0 } +let Predicates = [HasFPARMv8] in { + // Format for floating-point compare instructions. class A64I_fpcmp type, bits<2> op, bits<5> opcode2, dag outs, dag ins, string asmstr, @@ -562,6 +572,8 @@ class A64I_fpimm type, bits<5> imm5, // Inherit Rd in 4-0 } +} + // Format for load-register (literal) instructions. class A64I_LDRlit opc, bit v, dag outs, dag ins, string asmstr, @@ -959,3 +971,519 @@ class A64I_Breg opc, bits<5> op2, bits<6> op3, bits<5> op4, let Inst{4-0} = op4; } + +//===----------------------------------------------------------------------===// +// +// Neon Instruction Format Definitions. 
+// + +let Predicates = [HasNEON] in { + +class NeonInstAlias + : InstAlias { +} + +// Format AdvSIMD bitwise extract +class NeonI_BitExtract op2, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = op2; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + // imm4 in 14-11 + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD perm +class NeonI_Perm size, bits<3> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + let Inst{14-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD table lookup +class NeonI_TBL op2, bits<2> len, bit op, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-24} = 0b001110; + let Inst{23-22} = op2; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 3 vector registers with same vector type +class NeonI_3VSame size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 3 vector registers with different vector type +class NeonI_3VDiff size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11} = 0b0; + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD two registers and an element +class NeonI_2VElem size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01111; + let Inst{23-22} = size; + // l in Inst{21} + // m in Inst{20} + // Inherit Rm in 19-16 + let Inst{15-12} = opcode; + // h in Inst{11} + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 1 vector register with modified immediate +class NeonI_1VModImm patterns, InstrItinClass itin> + : A64InstRd { + bits<8> Imm; + bits<4> cmode; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = op; + let Inst{28-19} = 0b0111100000; + let Inst{15-12} = cmode; + let Inst{11} = 0b0; // o2 + let Inst{10} = 1; + // Inherit Rd in 4-0 + let Inst{18-16} = Imm{7-5}; // imm a:b:c + let Inst{9-5} = Imm{4-0}; // imm d:e:f:g:h +} + +// Format AdvSIMD 3 scalar registers with same type + +class NeonI_Scalar3Same size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let 
Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + + +// Format AdvSIMD 2 vector registers miscellaneous +class NeonI_2VMisc size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 2 vector 1 immediate shift +class NeonI_2VShiftImm opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + bits<7> Imm; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-23} = 0b011110; + let Inst{22-16} = Imm; + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD duplicate and insert +class NeonI_copy imm4, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + bits<5> Imm5; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{20-16} = Imm5; + let Inst{15} = 0b0; + let Inst{14-11} = imm4; + let Inst{10} = 0b1; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} +// Format AdvSIMD insert from element to vector +class NeonI_insert patterns, InstrItinClass itin> + : A64InstRdn { + bits<5> Imm5; + bits<4> Imm4; + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{20-16} = Imm5; + let Inst{15} = 0b0; + let Inst{14-11} = Imm4; + let Inst{10} = 0b1; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar pairwise +class NeonI_ScalarPair size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD 2 vector across lanes +class NeonI_2VAcross size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar two registers miscellaneous +class NeonI_Scalar2SameMisc size, bits<5> opcode, dag outs, dag ins, + string asmstr, list patterns, InstrItinClass itin> + : A64InstRdn { + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD vector load/store multiple N-element structure +class NeonI_LdStMult opcode, bits<2> size, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtn +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011000; + let Inst{22} = l; + let Inst{21-16} = 0b000000; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + 
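To make the `let Inst{...}` slices in these format classes concrete, the sketch below packs the NeonI_3VSame fields into a 32-bit word exactly as that class lays them out earlier in this hunk. It is illustrative only; the real encoder is generated by TableGen from these descriptions.

#include <cstdint>
#include <cstdio>

// Assemble the 32-bit encoding described by NeonI_3VSame above:
//   31=0 | 30=q | 29=u | 28-24=01110 | 23-22=size | 21=1 |
//   20-16=Rm | 15-11=opcode | 10=1 | 9-5=Rn | 4-0=Rd
static uint32_t encode3VSame(unsigned q, unsigned u, unsigned size,
                             unsigned opcode, unsigned rm, unsigned rn,
                             unsigned rd) {
  uint32_t Inst = 0;
  Inst |= (q & 1u) << 30;
  Inst |= (u & 1u) << 29;
  Inst |= 0x0Eu << 24;            // bits 28-24 = 0b01110
  Inst |= (size & 3u) << 22;
  Inst |= 1u << 21;
  Inst |= (rm & 0x1Fu) << 16;
  Inst |= (opcode & 0x1Fu) << 11;
  Inst |= 1u << 10;
  Inst |= (rn & 0x1Fu) << 5;
  Inst |= rd & 0x1Fu;
  return Inst;
}

int main() {
  // ADD Vd.16B, Vn.16B, Vm.16B uses q=1, u=0, size=00, opcode=0b10000,
  // the values the ADDvvv multiclass is instantiated with further down.
  std::printf("0x%08X\n", (unsigned)encode3VSame(1, 0, 0, 0x10, 2, 1, 0));
}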
+// Format AdvSIMD vector load/store multiple N-element structure (post-index) +class NeonI_LdStMult_Post opcode, bits<2> size, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtnm +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011001; + let Inst{22} = l; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11-10} = size; + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD vector load Single N-element structure to all lanes +class NeonI_LdOne_Dup opcode, bits<2> size, dag outs, + dag ins, string asmstr, list patterns, + InstrItinClass itin> + : A64InstRtn +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011010; + let Inst{22} = 0b1; + let Inst{21} = r; + let Inst{20-16} = 0b00000; + let Inst{15-13} = opcode; + let Inst{12} = 0b0; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD vector load/store Single N-element structure to/from one lane +class NeonI_LdStOne_Lane op2_1, bit op0, dag outs, + dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtn +{ + bits<4> lane; + let Inst{31} = 0b0; + let Inst{29-23} = 0b0011010; + let Inst{22} = l; + let Inst{21} = r; + let Inst{20-16} = 0b00000; + let Inst{15-14} = op2_1; + let Inst{13} = op0; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD post-index vector load Single N-element structure to all lanes +class NeonI_LdOne_Dup_Post opcode, bits<2> size, dag outs, + dag ins, string asmstr, list patterns, + InstrItinClass itin> + : A64InstRtnm +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011011; + let Inst{22} = 0b1; + let Inst{21} = r; + // Inherit Rm in 20-16 + let Inst{15-13} = opcode; + let Inst{12} = 0b0; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD post-index vector load/store Single N-element structure +// to/from one lane +class NeonI_LdStOne_Lane_Post op2_1, bit op0, dag outs, + dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtnm +{ + bits<4> lane; + let Inst{31} = 0b0; + let Inst{29-23} = 0b0011011; + let Inst{22} = l; + let Inst{21} = r; + // Inherit Rm in 20-16 + let Inst{15-14} = op2_1; + let Inst{13} = op0; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format AdvSIMD 3 scalar registers with different type + +class NeonI_Scalar3Diff size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31-30} = 0b01; + let Inst{29} = u; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar shift by immediate + +class NeonI_ScalarShiftImm opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + bits<4> Imm4; + bits<3> Imm3; + let Inst{31-30} = 0b01; + let Inst{29} = u; + let Inst{28-23} = 0b111110; + let Inst{22-19} = Imm4; + let Inst{18-16} = Imm3; + let Inst{15-11} = opcode; + let Inst{10} = 0b1; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD crypto AES +class NeonI_Crypto_AES size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + let Inst{31-24} = 0b01001110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10100; + let Inst{16-12} = opcode; + let Inst{11-10} 
= 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD crypto SHA +class NeonI_Crypto_SHA size, bits<5> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdn { + let Inst{31-24} = 0b01011110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10100; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD crypto 3V SHA +class NeonI_Crypto_3VSHA size, bits<3> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31-24} = 0b01011110; + let Inst{23-22} = size; + let Inst{21} = 0b0; + // Inherit Rm in 20-16 + let Inst{15} = 0b0; + let Inst{14-12} = opcode; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format AdvSIMD scalar x indexed element +class NeonI_ScalarXIndexedElem opcode, dag outs, dag ins, + string asmstr, list patterns, + InstrItinClass itin> + : A64InstRdnm +{ + let Inst{31} = 0b0; + let Inst{30} = 0b1; + let Inst{29} = u; + let Inst{28-24} = 0b11111; + let Inst{23} = szhi; + let Inst{22} = szlo; + // l in Inst{21} + // m in Instr{20} + // Inherit Rm in 19-16 + let Inst{15-12} = opcode; + // h in Inst{11} + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} +// Format AdvSIMD scalar copy - insert from element to scalar +class NeonI_ScalarCopy patterns, InstrItinClass itin> + : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> { + let Inst{28} = 0b1; +} +} + diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index cf3a2c3707d9..180110a84dd6 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -29,14 +29,14 @@ #include -#define GET_INSTRINFO_CTOR +#define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" using namespace llvm; AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), - RI(*this, STI), Subtarget(STI) {} + Subtarget(STI) {} void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, @@ -68,43 +68,71 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg) .addImm(A64SysReg::NZCV); } else if (AArch64::GPR64RegClass.contains(DestReg)) { - assert(AArch64::GPR64RegClass.contains(SrcReg)); - Opc = AArch64::ORRxxx_lsl; - ZeroReg = AArch64::XZR; + if(AArch64::GPR64RegClass.contains(SrcReg)){ + Opc = AArch64::ORRxxx_lsl; + ZeroReg = AArch64::XZR; + } else{ + assert(AArch64::FPR64RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::GPR32RegClass.contains(DestReg)) { - assert(AArch64::GPR32RegClass.contains(SrcReg)); - Opc = AArch64::ORRwww_lsl; - ZeroReg = AArch64::WZR; + if(AArch64::GPR32RegClass.contains(SrcReg)){ + Opc = AArch64::ORRwww_lsl; + ZeroReg = AArch64::WZR; + } else{ + assert(AArch64::FPR32RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::FPR32RegClass.contains(DestReg)) { - assert(AArch64::FPR32RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) - .addReg(SrcReg); - return; + if(AArch64::FPR32RegClass.contains(SrcReg)){ + BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) + .addReg(SrcReg); + return; + } + else { + 
assert(AArch64::GPR32RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::FPR64RegClass.contains(DestReg)) { - assert(AArch64::FPR64RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) - .addReg(SrcReg); - return; + if(AArch64::FPR64RegClass.contains(SrcReg)){ + BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) + .addReg(SrcReg); + return; + } + else { + assert(AArch64::GPR64RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg) + .addReg(SrcReg); + return; + } } else if (AArch64::FPR128RegClass.contains(DestReg)) { assert(AArch64::FPR128RegClass.contains(SrcReg)); - // FIXME: there's no good way to do this, at least without NEON: - // + There's no single move instruction for q-registers - // + We can't create a spill slot and use normal STR/LDR because stack - // allocation has already happened - // + We can't go via X-registers with FMOV because register allocation has - // already happened. - // This may not be efficient, but at least it works. - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) - .addReg(SrcReg) - .addReg(AArch64::XSP) - .addImm(0x1ff & -16); - - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) - .addReg(AArch64::XSP, RegState::Define) - .addReg(AArch64::XSP) - .addImm(16); - return; + // If NEON is enable, we use ORR to implement this copy. + // If NEON isn't available, emit STR and LDR to handle this. + if(getSubTarget().hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg) + .addReg(SrcReg) + .addReg(SrcReg); + return; + } else { + BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) + .addReg(SrcReg) + .addReg(AArch64::XSP) + .addImm(0x1ff & -16); + + BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) + .addReg(AArch64::XSP, RegState::Define) + .addReg(AArch64::XSP) + .addImm(16); + return; + } } else { llvm_unreachable("Unknown register class in copyPhysReg"); } @@ -116,17 +144,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0); } -MachineInstr * -AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) - .addFrameIndex(FrameIx).addImm(0) - .addImm(Offset) - .addMetadata(MDPtr); - return &*MIB; -} - /// Does the Opcode represent a conditional branch that we can remove and re-add /// at the end of a basic block? 
static bool isCondBranch(unsigned Opc) { diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 22a2ab4cf60a..620ecc93b170 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -43,10 +43,6 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const; - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; - void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index d2cfc7db2232..23d81fc478e8 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -11,6 +11,19 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. +// +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON", "neon">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto","crypto">; + +// Use fused MAC if more precision in FP computation is allowed. +def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast)">; include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// @@ -114,6 +127,8 @@ def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>; def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>; +class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; + //===----------------------------------------------------------------------===// // Call sequence pseudo-instructions //===----------------------------------------------------------------------===// @@ -1263,7 +1278,7 @@ def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)), // UBFX makes sense as an implementation of a 64-bit zero-extension too. Could // use either 64-bit or 32-bit variant, but 32-bit might be more efficient. -def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31), +def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31), sub_32)>; //===------------------------------- @@ -1967,6 +1982,13 @@ def fpz64 : Operand, let DecoderMethod = "DecodeFPZeroOperand"; } +def fpz64movi : Operand, + ComplexPattern { + let ParserMatchClass = fpzero_asmoperand; + let PrintMethod = "printFPZeroOperand"; + let DecoderMethod = "DecodeFPZeroOperand"; +} + multiclass A64I_fpcmpSignal type, bit imm, dag ins, dag pattern> { def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0}, (outs), ins, "fcmp\t$Rn, $Rm", [pattern], @@ -2173,6 +2195,29 @@ def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>; def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>; def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>; +// Extra patterns for when we're allowed to optimise separate multiplication and +// addition. 
+let Predicates = [HasFPARMv8, UseFusedMAC] in { +def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))), + (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))), + (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(f32 (fsub (f32 (fmul FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)), + (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; +def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul FPR32:$Rn, FPR32:$Rm)))), + (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))), + (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))), + (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(f64 (fsub (f64 (fmul FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)), + (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul FPR64:$Rn, FPR64:$Rm)))), + (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +} + + //===----------------------------------------------------------------------===// // Floating-point <-> fixed-point conversion instructions //===----------------------------------------------------------------------===// @@ -2308,6 +2353,7 @@ defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">; defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">; defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">; +let Predicates = [HasFPARMv8] in { def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>; def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>; def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>; @@ -2316,6 +2362,7 @@ def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>; def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>; def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>; def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>; +} multiclass A64I_inttofp { def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>; @@ -2327,6 +2374,7 @@ multiclass A64I_inttofp { defm S : A64I_inttofp<0b0, "scvtf">; defm U : A64I_inttofp<0b1, "ucvtf">; +let Predicates = [HasFPARMv8] in { def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>; def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>; def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>; @@ -2335,16 +2383,19 @@ def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>; def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>; def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>; def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>; +} def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">; def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">; def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">; def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">; +let Predicates = [HasFPARMv8] in { def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>; def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>; def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>; def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>; +} def lane1_asmoperand : AsmOperandClass { let Name = "Lane1"; @@ -2367,11 +2418,13 @@ let DecoderMethod = "DecodeFMOVLaneInstruction" in { "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>; } +let Predicates = [HasFPARMv8] in { def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]", (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>; def : InstAlias<"fmov $Rd.2d[$Lane], $Rn", (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>; +} 
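As a source-level illustration of what the UseFusedMAC patterns above are matching (a sketch, not part of the patch; whether fusion actually happens is governed by the AllowFPOpFusion / -ffp-contract setting named in the predicate, not by the code itself):

#include <cstdio>

// Under FPOpFusion::Fast, the separate multiply and add below may be selected
// as a single FMADDdddd by the patterns above; without that setting, only an
// explicit fma()/llvm.fma call is fused.
static double mac(double a, double b, double c) {
  return a * b + c;
}

int main() { std::printf("%f\n", mac(2.0, 3.0, 1.0)); }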
//===----------------------------------------------------------------------===// // Floating-point immediate instructions @@ -2465,11 +2518,15 @@ let mayLoad = 1 in { def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>; } +let Predicates = [HasFPARMv8] in { def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>; def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>; +} let mayLoad = 1 in { + let Predicates = [HasFPARMv8] in { def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>; + } def LDRSWx_lit : A64I_LDRlit<0b10, 0b0, @@ -3063,6 +3120,7 @@ defm LS32 defm LS64 : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>; +let Predicates = [HasFPARMv8] in { // STR/LDR to/from a B register defm LSFP8 : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>; @@ -3081,6 +3139,7 @@ defm LSFP64 defm LSFP128 : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128, qword_addrparams>; +} //===------------------------------ // 2.3 Signed loads @@ -3536,10 +3595,13 @@ multiclass A64I_LSPsimple opc, bit v, RegisterClass SomeReg, defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">; defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">; + +let Predicates = [HasFPARMv8] in { defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">; defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">; defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7, "LSFPPair128">; +} def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1, @@ -3974,14 +4036,17 @@ def : movalias; def : movalias; def : movalias; -def movw_addressref : ComplexPattern; +def movw_addressref_g0 : ComplexPattern">; +def movw_addressref_g1 : ComplexPattern">; +def movw_addressref_g2 : ComplexPattern">; +def movw_addressref_g3 : ComplexPattern">; -def : Pat<(A64WrapperLarge movw_addressref:$G3, movw_addressref:$G2, - movw_addressref:$G1, movw_addressref:$G0), - (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref:$G3), - movw_addressref:$G2), - movw_addressref:$G1), - movw_addressref:$G0)>; +def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2, + movw_addressref_g1:$G1, movw_addressref_g0:$G0), + (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref_g3:$G3), + movw_addressref_g2:$G2), + movw_addressref_g1:$G1), + movw_addressref_g0:$G0)>; //===----------------------------------------------------------------------===// // PC-relative addressing instructions @@ -5120,3 +5185,9 @@ defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm), defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)), (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "AArch64InstrNEON.td" diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td new file mode 100644 index 000000000000..d71749d0e61b --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -0,0 +1,8671 @@ +//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the AArch64 NEON instruction set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. +//===----------------------------------------------------------------------===// +def Neon_bsl : SDNode<"AArch64ISD::NEON_BSL", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + +// (outs Result), (ins Imm, OpCmode) +def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>; + +def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>; + +def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>; + +// (outs Result), (ins Imm) +def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1, + [SDTCisVec<0>, SDTCisVT<1, i32>]>>; + +// (outs Result), (ins LHS, RHS, CondCode) +def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; + +// (outs Result), (ins LHS, 0/0.0 constant, CondCode) +def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisVec<1>]>>; + +// (outs Result), (ins LHS, RHS) +def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; + +def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; +def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; + +def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>; +def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>; +def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>; +def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>; +def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>; +def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>; + +def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>; +def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>; +def Neon_rev16 : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>; +def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1, + [SDTCisVec<0>]>>; +def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>; +def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>; + +def SDT_assertext : SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>; +def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>; +def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +multiclass NeonI_3VSame_B_sizes size, bits<5> opcode, + string asmop, SDPatternOperator opnode8B, + SDPatternOperator opnode16B, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _8B : NeonI_3VSame<0b0, u, size, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], + NoItinerary>; + + def _16B : NeonI_3VSame<0b1, u, size, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # 
"\t$Rd.16b, $Rn.16b, $Rm.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], + NoItinerary>; + } + +} + +multiclass NeonI_3VSame_HS_sizes opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _4H : NeonI_3VSame<0b0, u, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h", + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))], + NoItinerary>; + + def _8H : NeonI_3VSame<0b1, u, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h", + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))], + NoItinerary>; + + def _2S : NeonI_3VSame<0b0, u, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))], + NoItinerary>; + + def _4S : NeonI_3VSame<0b1, u, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))], + NoItinerary>; + } +} +multiclass NeonI_3VSame_BHS_sizes opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> + : NeonI_3VSame_HS_sizes { + let isCommutable = Commutable in { + def _8B : NeonI_3VSame<0b0, u, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], + NoItinerary>; + + def _16B : NeonI_3VSame<0b1, u, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], + NoItinerary>; + } +} + +multiclass NeonI_3VSame_BHSD_sizes opcode, + string asmop, SDPatternOperator opnode, + bit Commutable = 0> + : NeonI_3VSame_BHS_sizes { + let isCommutable = Commutable in { + def _2D : NeonI_3VSame<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", + [(set (v2i64 VPR128:$Rd), + (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))], + NoItinerary>; + } +} + +// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types, +// but Result types can be integer or floating point types. 
+multiclass NeonI_3VSame_SD_sizes opcode, + string asmop, SDPatternOperator opnode2S, + SDPatternOperator opnode4S, + SDPatternOperator opnode2D, + ValueType ResTy2S, ValueType ResTy4S, + ValueType ResTy2D, bit Commutable = 0> { + let isCommutable = Commutable in { + def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", + [(set (ResTy2S VPR64:$Rd), + (ResTy2S (opnode2S (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))], + NoItinerary>; + + def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", + [(set (ResTy4S VPR128:$Rd), + (ResTy4S (opnode4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))], + NoItinerary>; + + def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", + [(set (ResTy2D VPR128:$Rd), + (ResTy2D (opnode2D (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))], + NoItinerary>; + } +} + +//===----------------------------------------------------------------------===// +// Instruction Definitions +//===----------------------------------------------------------------------===// + +// Vector Arithmetic Instructions + +// Vector Add (Integer and Floating-Point) + +defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>; +defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, fadd, fadd, + v2f32, v4f32, v2f64, 1>; + +// Vector Sub (Integer and Floating-Point) + +defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>; +defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, fsub, fsub, + v2f32, v4f32, v2f64, 0>; + +// Vector Multiply (Integer and Floating-Point) + +defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>; +defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, fmul, fmul, + v2f32, v4f32, v2f64, 1>; + +// Vector Multiply (Polynomial) + +defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul", + int_arm_neon_vmulp, int_arm_neon_vmulp, 1>; + +// Vector Multiply-accumulate and Multiply-subtract (Integer) + +// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and +// two operands constraints. 
+class NeonI_3VSame_Constraint_impl size, + bits<5> opcode, SDPatternOperator opnode> + : NeonI_3VSame { + let Constraints = "$src = $Rd"; +} + +def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (mul node:$Rn, node:$Rm))>; + +def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (sub node:$Ra, (mul node:$Rn, node:$Rm))>; + + +def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8, + 0b0, 0b0, 0b00, 0b10010, Neon_mla>; +def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8, + 0b1, 0b0, 0b00, 0b10010, Neon_mla>; +def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16, + 0b0, 0b0, 0b01, 0b10010, Neon_mla>; +def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16, + 0b1, 0b0, 0b01, 0b10010, Neon_mla>; +def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32, + 0b0, 0b0, 0b10, 0b10010, Neon_mla>; +def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32, + 0b1, 0b0, 0b10, 0b10010, Neon_mla>; + +def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b00, 0b10010, Neon_mls>; +def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b00, 0b10010, Neon_mls>; +def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16, + 0b0, 0b1, 0b01, 0b10010, Neon_mls>; +def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16, + 0b1, 0b1, 0b01, 0b10010, Neon_mls>; +def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32, + 0b0, 0b1, 0b10, 0b10010, Neon_mls>; +def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32, + 0b1, 0b1, 0b10, 0b10010, Neon_mls>; + +// Vector Multiply-accumulate and Multiply-subtract (Floating Point) + +def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (fadd node:$Ra, (fmul node:$Rn, node:$Rm))>; + +def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (fsub node:$Ra, (fmul node:$Rn, node:$Rm))>; + +let Predicates = [HasNEON, UseFusedMAC] in { +def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32, + 0b0, 0b0, 0b00, 0b11001, Neon_fmla>; +def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32, + 0b1, 0b0, 0b00, 0b11001, Neon_fmla>; +def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64, + 0b1, 0b0, 0b01, 0b11001, Neon_fmla>; + +def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32, + 0b0, 0b0, 0b10, 0b11001, Neon_fmls>; +def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32, + 0b1, 0b0, 0b10, 0b11001, Neon_fmls>; +def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64, + 0b1, 0b0, 0b11, 0b11001, Neon_fmls>; +} + +// We're also allowed to match the fma instruction regardless of compile +// options. 
+def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)), + (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; +def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), + (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; +def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), + (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; + +def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)), + (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; +def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), + (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; +def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), + (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; + +// Vector Divide (Floating-Point) + +defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, fdiv, fdiv, + v2f32, v4f32, v2f64, 0>; + +// Vector Bitwise Operations + +// Vector Bitwise AND + +defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>; + +// Vector Bitwise Exclusive OR + +defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>; + +// Vector Bitwise OR + +defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>; + +// ORR disassembled as MOV if Vn==Vm + +// Vector Move - register +// Alias for ORR if Vn=Vm. +// FIXME: This is actually the preferred syntax but TableGen can't deal with +// custom printing of aliases. +def : NeonInstAlias<"mov $Rd.8b, $Rn.8b", + (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>; +def : NeonInstAlias<"mov $Rd.16b, $Rn.16b", + (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>; + +// The MOVI instruction takes two immediate operands. The first is the +// immediate encoding, while the second is the cmode. A cmode of 14, or +// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC. 
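The cmode comment above can be made concrete for the one encoding these PatFrags use: with cmode 0b1110 the 8-bit immediate is replicated into every byte, so (MOVI 255, 14) yields an all-ones pattern and (MOVI 0, 14) an all-zeros one. A minimal sketch of just that expansion (other cmode values, which place or shift the byte differently, are deliberately not modelled):

#include <cstdint>
#include <cstdio>

// For cmode 0b1110 the 8-bit immediate is simply replicated into every byte
// of the 64-bit pattern, which is why Neon_AllOne below is (MOVI 255, 14).
static uint64_t expandMoviCmode14(uint8_t Imm8) {
  uint64_t Pattern = 0;
  for (int i = 0; i < 8; ++i)
    Pattern |= (uint64_t)Imm8 << (8 * i);
  return Pattern;
}

int main() {
  std::printf("AllOne  = 0x%016llX\n",
              (unsigned long long)expandMoviCmode14(255));
  std::printf("AllZero = 0x%016llX\n",
              (unsigned long long)expandMoviCmode14(0));
}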
+def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>; +def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>; + +def Neon_not8B : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>; +def Neon_not16B : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>; + +def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm), + (or node:$Rn, (Neon_not8B node:$Rm))>; + +def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm), + (or node:$Rn, (Neon_not16B node:$Rm))>; + +def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm), + (and node:$Rn, (Neon_not8B node:$Rm))>; + +def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm), + (and node:$Rn, (Neon_not16B node:$Rm))>; + + +// Vector Bitwise OR NOT - register + +defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn", + Neon_orn8B, Neon_orn16B, 0>; + +// Vector Bitwise Bit Clear (AND NOT) - register + +defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic", + Neon_bic8B, Neon_bic16B, 0>; + +multiclass Neon_bitwise2V_patterns { + def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$Rn, VPR128:$Rm)>; +} + +// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; +defm : Neon_bitwise2V_patterns; + +// Vector Bitwise Select +def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b01, 0b00011, Neon_bsl>; + +def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b01, 0b00011, Neon_bsl>; + +multiclass Neon_bitwise3V_patterns { + // Disassociate type from instruction definition + def : Pat<(v2i32 (opnode VPR64:$src,VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i32 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + + // Allow to match BSL instruction pattern with non-constant operand + def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd), + (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), + (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, 
(Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd), + (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), + (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; + + // Allow to match llvm.arm.* intrinsics. + def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src), + (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src), + (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src), + (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src), + (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src), + (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src), + (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; + def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src), + (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src), + (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src), + (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src), + (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src), + (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; + def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src), + (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; +} + +// Additional patterns for bitwise instruction BSL +defm: Neon_bitwise3V_patterns; + +def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm), + (Neon_bsl node:$src, node:$Rn, node:$Rm), + [{ (void)N; return false; }]>; + +// Vector Bitwise Insert if True + +def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>; +def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>; + +// Vector Bitwise Insert if False + +def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>; +def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>; + +// Vector Absolute Difference and Accumulate (Signed, Unsigned) + +def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>; +def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>; + +// Vector Absolute Difference and Accumulate (Unsigned) +def UABAvvv_8B : 
NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8, + 0b0, 0b1, 0b00, 0b01111, Neon_uaba>; +def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8, + 0b1, 0b1, 0b00, 0b01111, Neon_uaba>; +def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16, + 0b0, 0b1, 0b01, 0b01111, Neon_uaba>; +def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16, + 0b1, 0b1, 0b01, 0b01111, Neon_uaba>; +def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32, + 0b0, 0b1, 0b10, 0b01111, Neon_uaba>; +def UABAvvv_4S : NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32, + 0b1, 0b1, 0b10, 0b01111, Neon_uaba>; + +// Vector Absolute Difference and Accumulate (Signed) +def SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8, + 0b0, 0b0, 0b00, 0b01111, Neon_saba>; +def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8, + 0b1, 0b0, 0b00, 0b01111, Neon_saba>; +def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16, + 0b0, 0b0, 0b01, 0b01111, Neon_saba>; +def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16, + 0b1, 0b0, 0b01, 0b01111, Neon_saba>; +def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32, + 0b0, 0b0, 0b10, 0b01111, Neon_saba>; +def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32, + 0b1, 0b0, 0b10, 0b01111, Neon_saba>; + + +// Vector Absolute Difference (Signed, Unsigned) +defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>; +defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>; + +// Vector Absolute Difference (Floating Point) +defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd", + int_arm_neon_vabds, int_arm_neon_vabds, + int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>; + +// Vector Reciprocal Step (Floating Point) +defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps", + int_arm_neon_vrecps, int_arm_neon_vrecps, + int_arm_neon_vrecps, + v2f32, v4f32, v2f64, 0>; + +// Vector Reciprocal Square Root Step (Floating Point) +defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", + int_arm_neon_vrsqrts, + int_arm_neon_vrsqrts, + int_arm_neon_vrsqrts, + v2f32, v4f32, v2f64, 0>; + +// Vector Comparisons + +def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETEQ)>; +def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETUGE)>; +def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETGE)>; +def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETUGT)>; +def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs), + (Neon_cmp node:$lhs, node:$rhs, SETGT)>; + +// NeonI_compare_aliases class: swaps register operands to implement +// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed. 
+class NeonI_compare_aliases + : NeonInstAlias; + +// Vector Comparisons (Integer) + +// Vector Compare Mask Equal (Integer) +let isCommutable =1 in { +defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>; +} + +// Vector Compare Mask Higher or Same (Unsigned Integer) +defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>; + +// Vector Compare Mask Greater Than or Equal (Integer) +defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>; + +// Vector Compare Mask Higher (Unsigned Integer) +defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>; + +// Vector Compare Mask Greater Than (Integer) +defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>; + +// Vector Compare Mask Bitwise Test (Integer) +defm CMTSTvvv: NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>; + +// Vector Compare Mask Less or Same (Unsigned Integer) +// CMLS is alias for CMHS with operands reversed. +def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>; +def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>; +def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>; +def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>; +def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>; +def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>; +def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>; + +// Vector Compare Mask Less Than or Equal (Integer) +// CMLE is alias for CMGE with operands reversed. +def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>; +def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>; +def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>; +def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>; +def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>; +def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>; +def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>; + +// Vector Compare Mask Lower (Unsigned Integer) +// CMLO is alias for CMHI with operands reversed. +def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>; +def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>; +def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>; +def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>; +def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>; +def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>; +def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>; + +// Vector Compare Mask Less Than (Integer) +// CMLT is alias for CMGT with operands reversed. 
+def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>; +def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>; +def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>; +def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>; +def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>; +def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>; +def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>; + + +def neon_uimm0_asmoperand : AsmOperandClass +{ + let Name = "UImm0"; + let PredicateMethod = "isUImm<0>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm0 : Operand, ImmLeaf { + let ParserMatchClass = neon_uimm0_asmoperand; + let PrintMethod = "printNeonUImm0Operand"; + +} + +multiclass NeonI_cmpz_sizes opcode, string asmop, CondCode CC> +{ + def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.8b, $Rn.8b, $Imm", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.16b, $Rn.16b, $Imm", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.4h, $Rn.4h, $Imm", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.8h, $Rn.8h, $Imm", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.2s, $Rn.2s, $Imm", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.4s, $Rn.4s, $Imm", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; + + def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), + asmop # "\t$Rd.2d, $Rn.2d, $Imm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))], + NoItinerary>; +} + +// Vector Compare Mask Equal to Zero (Integer) +defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>; + +// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) +defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>; + +// Vector Compare Mask Greater Than Zero (Signed Integer) +defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>; + +// Vector Compare Mask Less Than or Equal To Zero (Signed Integer) +defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>; + +// Vector Compare Mask Less Than Zero (Signed Integer) +defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>; + +// Vector Comparisons (Floating Point) + +// Vector Compare Mask Equal (Floating Point) +let isCommutable =1 in { +defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq, + Neon_cmeq, Neon_cmeq, + v2i32, v4i32, v2i64, 0>; +} + +// Vector Compare Mask Greater Than Or Equal 
(Floating Point) +defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge, + Neon_cmge, Neon_cmge, + v2i32, v4i32, v2i64, 0>; + +// Vector Compare Mask Greater Than (Floating Point) +defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt, + Neon_cmgt, Neon_cmgt, + v2i32, v4i32, v2i64, 0>; + +// Vector Compare Mask Less Than Or Equal (Floating Point) +// FCMLE is alias for FCMGE with operands reversed. +def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>; +def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>; +def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>; + +// Vector Compare Mask Less Than (Floating Point) +// FCMLT is alias for FCMGT with operands reversed. +def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>; +def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>; +def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>; + + +multiclass NeonI_fpcmpz_sizes opcode, + string asmop, CondCode CC> +{ + def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.2s, $Rn.2s, $FPImm", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; + + def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.4s, $Rn.4s, $FPImm", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; + + def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm), + asmop # "\t$Rd.2d, $Rn.2d, $FPImm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))], + NoItinerary>; +} + +// Vector Compare Mask Equal to Zero (Floating Point) +defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>; + +// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) +defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>; + +// Vector Compare Mask Greater Than Zero (Floating Point) +defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>; + +// Vector Compare Mask Less Than or Equal To Zero (Floating Point) +defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>; + +// Vector Compare Mask Less Than Zero (Floating Point) +defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>; + +// Vector Absolute Comparisons (Floating Point) + +// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point) +defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge", + int_arm_neon_vacged, int_arm_neon_vacgeq, + int_aarch64_neon_vacgeq, + v2i32, v4i32, v2i64, 0>; + +// Vector Absolute Compare Mask Greater Than (Floating Point) +defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt", + int_arm_neon_vacgtd, int_arm_neon_vacgtq, + int_aarch64_neon_vacgtq, + v2i32, v4i32, v2i64, 0>; + +// Vector Absolute Compare Mask Less Than Or Equal (Floating Point) +// FACLE is alias for FACGE with operands reversed. +def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>; +def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>; +def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>; + +// Vector Absolute Compare Mask Less Than (Floating Point) +// FACLT is alias for FACGT with operands reversed. 
+def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>; +def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>; +def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>; + +// Vector halving add (Integer Signed, Unsigned) +defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd", + int_arm_neon_vhadds, 1>; +defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd", + int_arm_neon_vhaddu, 1>; + +// Vector halving sub (Integer Signed, Unsigned) +defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub", + int_arm_neon_vhsubs, 0>; +defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub", + int_arm_neon_vhsubu, 0>; + +// Vector rouding halving add (Integer Signed, Unsigned) +defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd", + int_arm_neon_vrhadds, 1>; +defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd", + int_arm_neon_vrhaddu, 1>; + +// Vector Saturating add (Integer Signed, Unsigned) +defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd", + int_arm_neon_vqadds, 1>; +defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd", + int_arm_neon_vqaddu, 1>; + +// Vector Saturating sub (Integer Signed, Unsigned) +defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub", + int_arm_neon_vqsubs, 1>; +defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub", + int_arm_neon_vqsubu, 1>; + +// Vector Shift Left (Signed and Unsigned Integer) +defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl", + int_arm_neon_vshifts, 1>; +defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl", + int_arm_neon_vshiftu, 1>; + +// Vector Saturating Shift Left (Signed and Unsigned Integer) +defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl", + int_arm_neon_vqshifts, 1>; +defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl", + int_arm_neon_vqshiftu, 1>; + +// Vector Rouding Shift Left (Signed and Unsigned Integer) +defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl", + int_arm_neon_vrshifts, 1>; +defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl", + int_arm_neon_vrshiftu, 1>; + +// Vector Saturating Rouding Shift Left (Signed and Unsigned Integer) +defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl", + int_arm_neon_vqrshifts, 1>; +defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl", + int_arm_neon_vqrshiftu, 1>; + +// Vector Maximum (Signed and Unsigned Integer) +defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>; +defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>; + +// Vector Minimum (Signed and Unsigned Integer) +defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>; +defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>; + +// Vector Maximum (Floating Point) +defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax", + int_arm_neon_vmaxs, int_arm_neon_vmaxs, + int_arm_neon_vmaxs, v2f32, v4f32, v2f64, 1>; + +// Vector Minimum (Floating Point) +defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin", + int_arm_neon_vmins, int_arm_neon_vmins, + int_arm_neon_vmins, v2f32, v4f32, v2f64, 1>; + +// Vector maxNum (Floating Point) - prefer a number over a quiet NaN) +defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm", + int_aarch64_neon_vmaxnm, + int_aarch64_neon_vmaxnm, + int_aarch64_neon_vmaxnm, + v2f32, v4f32, v2f64, 1>; + +// Vector minNum (Floating Point) - prefer a number over 
a quiet NaN) +defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm", + int_aarch64_neon_vminnm, + int_aarch64_neon_vminnm, + int_aarch64_neon_vminnm, + v2f32, v4f32, v2f64, 1>; + +// Vector Maximum Pairwise (Signed and Unsigned Integer) +defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>; +defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>; + +// Vector Minimum Pairwise (Signed and Unsigned Integer) +defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>; +defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>; + +// Vector Maximum Pairwise (Floating Point) +defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp", + int_arm_neon_vpmaxs, int_arm_neon_vpmaxs, + int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>; + +// Vector Minimum Pairwise (Floating Point) +defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp", + int_arm_neon_vpmins, int_arm_neon_vpmins, + int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>; + +// Vector maxNum Pairwise (Floating Point) - prefer a number over a quiet NaN) +defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp", + int_aarch64_neon_vpmaxnm, + int_aarch64_neon_vpmaxnm, + int_aarch64_neon_vpmaxnm, + v2f32, v4f32, v2f64, 1>; + +// Vector minNum Pairwise (Floating Point) - prefer a number over a quiet NaN) +defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp", + int_aarch64_neon_vpminnm, + int_aarch64_neon_vpminnm, + int_aarch64_neon_vpminnm, + v2f32, v4f32, v2f64, 1>; + +// Vector Addition Pairwise (Integer) +defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>; + +// Vector Addition Pairwise (Floating Point) +defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp", + int_arm_neon_vpadd, + int_arm_neon_vpadd, + int_arm_neon_vpadd, + v2f32, v4f32, v2f64, 1>; + +// Vector Saturating Doubling Multiply High +defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh", + int_arm_neon_vqdmulh, 1>; + +// Vector Saturating Rouding Doubling Multiply High +defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh", + int_arm_neon_vqrdmulh, 1>; + +// Vector Multiply Extended (Floating Point) +defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx", + int_aarch64_neon_vmulx, + int_aarch64_neon_vmulx, + int_aarch64_neon_vmulx, + v2f32, v4f32, v2f64, 1>; + +// Vector Immediate Instructions + +multiclass neon_mov_imm_shift_asmoperands +{ + def _asmoperand : AsmOperandClass + { + let Name = "NeonMovImmShift" # PREFIX; + let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands"; + let PredicateMethod = "isNeonMovImmShift" # PREFIX; + } +} + +// Definition of vector immediates shift operands + +// The selectable use-cases extract the shift operation +// information from the OpCmode fields encoded in the immediate. +def neon_mod_shift_imm_XFORM : SDNodeXFormgetZExtValue(); + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); + if (!HasShift) return SDValue(); + return CurDAG->getTargetConstant(ShiftImm, MVT::i32); +}]>; + +// Vector immediates shift operands which accept LSL and MSL +// shift operators with shift value in the range of 0, 8, 16, 24 (LSL), +// or 0, 8 (LSLH) or 8, 16 (MSL). 
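// A minimal sketch of the decoding this relies on (an assumed re-derivation
// from the cmode values assigned later in this file and the modified-immediate
// layout; the in-tree helper A64Imms::decodeNeonModShiftImm may differ in
// detail). It reports whether the cmode carries a shift, the shift amount, and
// whether ones are shifted in (MSL) rather than zeros (LSL).
static bool decodeShiftFromCmode(unsigned Cmode, unsigned &ShiftImm,
                                 bool &ShiftOnesIn) {
  Cmode &= 0xf;
  if ((Cmode & 0x8) == 0x0) {   // 0xxx: 32-bit, LSL #(0|8|16|24)
    ShiftImm = 8 * ((Cmode >> 1) & 0x3);
    ShiftOnesIn = false;
    return true;
  }
  if ((Cmode & 0xc) == 0x8) {   // 10xx: 16-bit, LSL #(0|8) - the LSLH case
    ShiftImm = 8 * ((Cmode >> 1) & 0x1);
    ShiftOnesIn = false;
    return true;
  }
  if ((Cmode & 0xe) == 0xc) {   // 110x: 32-bit, MSL #(8|16)
    ShiftImm = 8 << (Cmode & 0x1);
    ShiftOnesIn = true;
    return true;
  }
  return false;                 // 111x: no shift operand
}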
+defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">; +defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">; +// LSLH restricts shift amount to 0, 8 out of 0, 8, 16, 24 +defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">; + +multiclass neon_mov_imm_shift_operands +{ + def _operand : Operand, ImmLeaf + { + let PrintMethod = + "printNeonMovImmShiftOperand"; + let DecoderMethod = + "DecodeNeonMovImmShiftOperand"; + let ParserMatchClass = + !cast("neon_mov_imm_" # PREFIX # HALF # "_asmoperand"); + } +} + +defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); +}]>; + +defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && ShiftOnesIn); +}]>; + +defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{ + unsigned ShiftImm; + unsigned ShiftOnesIn; + unsigned HasShift = + A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); + return (HasShift && !ShiftOnesIn); +}]>; + +def neon_uimm1_asmoperand : AsmOperandClass +{ + let Name = "UImm1"; + let PredicateMethod = "isUImm<1>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm2_asmoperand : AsmOperandClass +{ + let Name = "UImm2"; + let PredicateMethod = "isUImm<2>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm8_asmoperand : AsmOperandClass +{ + let Name = "UImm8"; + let PredicateMethod = "isUImm<8>"; + let RenderMethod = "addImmOperands"; +} + +def neon_uimm8 : Operand, ImmLeaf { + let ParserMatchClass = neon_uimm8_asmoperand; + let PrintMethod = "printUImmHexOperand"; +} + +def neon_uimm64_mask_asmoperand : AsmOperandClass +{ + let Name = "NeonUImm64Mask"; + let PredicateMethod = "isNeonUImm64Mask"; + let RenderMethod = "addNeonUImm64MaskOperands"; +} + +// MCOperand for 64-bit bytemask with each byte having only the +// value 0x00 and 0xff is encoded as an unsigned 8-bit value +def neon_uimm64_mask : Operand, ImmLeaf { + let ParserMatchClass = neon_uimm64_mask_asmoperand; + let PrintMethod = "printNeonUImm64MaskOperand"; +} + +multiclass NeonI_mov_imm_lsl_sizes +{ + // shift zeros, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (timm:$Imm), + (neon_mov_imm_LSL_operand:$Simm))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (timm:$Imm), + (neon_mov_imm_LSL_operand:$Simm))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; + } + + // shift zeros, per halfword + def _4H : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (timm:$Imm), + (neon_mov_imm_LSLH_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b0}; + } + + def _8H : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + 
neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (timm:$Imm), + (neon_mov_imm_LSLH_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b0}; + } +} + +multiclass NeonI_mov_imm_with_constraint_lsl_sizes +{ + let Constraints = "$src = $Rd" in { + // shift zeros, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (v2i32 VPR64:$src), + (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (v4i32 VPR128:$src), + (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bits<2> Simm; + let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; + } + + // shift zeros, per halfword + def _4H : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), + [(set (v4i16 VPR64:$Rd), + (v4i16 (opnode (v4i16 VPR64:$src), + (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b1}; + } + + def _8H : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm), + !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), + [(set (v8i16 VPR128:$Rd), + (v8i16 (opnode (v8i16 VPR128:$src), + (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSL_operand:$Simm)))))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b0, Simm, 0b1}; + } + } +} + +multiclass NeonI_mov_imm_msl_sizes +{ + // shift ones, per word + def _2S : NeonI_1VModImm<0b0, op, + (outs VPR64:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_MSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), + [(set (v2i32 VPR64:$Rd), + (v2i32 (opnode (timm:$Imm), + (neon_mov_imm_MSL_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b1, 0b0, Simm}; + } + + def _4S : NeonI_1VModImm<0b1, op, + (outs VPR128:$Rd), + (ins neon_uimm8:$Imm, + neon_mov_imm_MSL_operand:$Simm), + !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), + [(set (v4i32 VPR128:$Rd), + (v4i32 (opnode (timm:$Imm), + (neon_mov_imm_MSL_operand:$Simm))))], + NoItinerary> { + bit Simm; + let cmode = {0b1, 0b1, 0b0, Simm}; + } +} + +// Vector Move Immediate Shifted +let isReMaterializable = 1 in { +defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>; +} + +// Vector Move Inverted Immediate Shifted +let isReMaterializable = 1 in { +defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>; +} + +// Vector Bitwise Bit Clear (AND NOT) - immediate +let isReMaterializable = 1 in { +defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1, + and, Neon_mvni>; +} + +// Vector Bitwise OR - immedidate + +let isReMaterializable = 1 in { +defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0, + or, Neon_movi>; +} + +// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate +// LowerBUILD_VECTOR favors lowering MOVI over MVNI. 
+// BIC immediate instructions selection requires additional patterns to +// transform Neon_movi operands into BIC immediate operands + +def neon_mov_imm_LSLH_transform_XFORM : SDNodeXFormgetZExtValue(); + unsigned ShiftImm; + unsigned ShiftOnesIn; + (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); + // LSLH restricts shift amount to 0, 8 which are encoded as 0 and 1 + // Transform encoded shift amount 0 to 1 and 1 to 0. + return CurDAG->getTargetConstant(!ShiftImm, MVT::i32); +}]>; + +def neon_mov_imm_LSLH_transform_operand + : ImmLeaf; + +// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0x00, LSL 8) +// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0x00) +def : Pat<(v4i16 (and VPR64:$src, + (v4i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))), + (BICvi_lsl_4H VPR64:$src, 0, + neon_mov_imm_LSLH_transform_operand:$Simm)>; + +// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0x00, LSL 8) +// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0x00) +def : Pat<(v8i16 (and VPR128:$src, + (v8i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))), + (BICvi_lsl_8H VPR128:$src, 0, + neon_mov_imm_LSLH_transform_operand:$Simm)>; + + +multiclass Neon_bitwiseVi_patterns { + def : Pat<(v8i8 (opnode VPR64:$src, + (bitconvert(v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST4H VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v1i64 (opnode VPR64:$src, + (bitconvert(v4i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST4H VPR64:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + + def : Pat<(v16i8 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v4i32 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; + def : Pat<(v2i64 (opnode VPR128:$src, + (bitconvert(v8i16 (neonopnode timm:$Imm, + neon_mov_imm_LSLH_operand:$Simm))))), + (INST8H VPR128:$src, neon_uimm8:$Imm, + neon_mov_imm_LSLH_operand:$Simm)>; +} + +// Additional patterns for Vector Vector Bitwise Bit Clear (AND NOT) - immediate +defm : Neon_bitwiseVi_patterns; + +// Additional patterns for Vector Bitwise OR - immedidate +defm : Neon_bitwiseVi_patterns; + + +// Vector Move Immediate Masked +let isReMaterializable = 1 in { +defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>; +} + +// Vector Move Inverted Immediate Masked +let isReMaterializable = 1 in { +defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>; +} + +class NeonI_mov_imm_lsl_aliases + : NeonInstAlias; + +// Aliases for Vector Move Immediate Shifted +def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>; + +// Aliases for Vector Move Inverted Immediate Shifted +def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>; + +// Aliases for Vector Bitwise Bit Clear (AND NOT) - 
immediate +def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>; + +// Aliases for Vector Bitwise OR - immedidate +def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>; +def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>; + +// Vector Move Immediate - per byte +let isReMaterializable = 1 in { +def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0, + (outs VPR64:$Rd), (ins neon_uimm8:$Imm), + "movi\t$Rd.8b, $Imm", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} + +def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0, + (outs VPR128:$Rd), (ins neon_uimm8:$Imm), + "movi\t$Rd.16b, $Imm", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Move Immediate - bytemask, per double word +let isReMaterializable = 1 in { +def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1, + (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm), + "movi\t $Rd.2d, $Imm", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Move Immediate - bytemask, one doubleword + +let isReMaterializable = 1 in { +def MOVIdi : NeonI_1VModImm<0b0, 0b1, + (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm), + "movi\t $Rd, $Imm", + [(set (v1i64 FPR64:$Rd), + (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))], + NoItinerary> { + let cmode = 0b1110; +} +} + +// Vector Floating Point Move Immediate + +class NeonI_FMOV_impl + : NeonI_1VModImm { + let cmode = 0b1111; + } + +let isReMaterializable = 1 in { +def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>; +def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>; +def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>; +} + +// Vector Shift (Immediate) +// Immediate in [0, 63] +def imm0_63 : Operand { + let ParserMatchClass = uimm6_asmoperand; +} + +// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded +// as follows: +// +// Offset Encoding +// 8 immh:immb<6:3> = '0001xxx', is encoded in immh:immb<2:0> +// 16 immh:immb<6:4> = '001xxxx', is encoded in immh:immb<3:0> +// 32 immh:immb<6:5> = '01xxxxx', is encoded in immh:immb<4:0> +// 64 immh:immb<6> = '1xxxxxx', is encoded in immh:immb<5:0> +// +// The shift right immediate amount, in the range 1 to element bits, is computed +// as Offset - UInt(immh:immb). The shift left immediate amount, in the range 0 +// to element bits - 1, is computed as UInt(immh:immb) - Offset. 
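// A worked reading of the table above (hypothetical helpers, assuming
// "Offset" is the element size in bits): a right shift by N is encoded as
// 2*ElemBits - N and a left shift by N as ElemBits + N, so for example
// sshr .8b, #3 uses immh:immb = 0b0001101 and shl .8b, #3 uses
// immh:immb = 0b0001011.
static unsigned encodeVecShiftRight(unsigned ElemBits, unsigned Shift) {
  return 2 * ElemBits - Shift;  // Shift in [1, ElemBits]
}
static unsigned encodeVecShiftLeft(unsigned ElemBits, unsigned Shift) {
  return ElemBits + Shift;      // Shift in [0, ElemBits - 1]
}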
+ +class shr_imm_asmoperands : AsmOperandClass { + let Name = "ShrImm" # OFFSET; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "ShrImm" # OFFSET; +} + +class shr_imm : Operand { + let EncoderMethod = "getShiftRightImm" # OFFSET; + let DecoderMethod = "DecodeShiftRightImm" # OFFSET; + let ParserMatchClass = + !cast("shr_imm" # OFFSET # "_asmoperand"); +} + +def shr_imm8_asmoperand : shr_imm_asmoperands<"8">; +def shr_imm16_asmoperand : shr_imm_asmoperands<"16">; +def shr_imm32_asmoperand : shr_imm_asmoperands<"32">; +def shr_imm64_asmoperand : shr_imm_asmoperands<"64">; + +def shr_imm8 : shr_imm<"8">, ImmLeaf 0 && Imm <= 8;}]>; +def shr_imm16 : shr_imm<"16">, ImmLeaf 0 && Imm <= 16;}]>; +def shr_imm32 : shr_imm<"32">, ImmLeaf 0 && Imm <= 32;}]>; +def shr_imm64 : shr_imm<"64">, ImmLeaf 0 && Imm <= 64;}]>; + +class shl_imm_asmoperands : AsmOperandClass { + let Name = "ShlImm" # OFFSET; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "ShlImm" # OFFSET; +} + +class shl_imm : Operand { + let EncoderMethod = "getShiftLeftImm" # OFFSET; + let DecoderMethod = "DecodeShiftLeftImm" # OFFSET; + let ParserMatchClass = + !cast("shl_imm" # OFFSET # "_asmoperand"); +} + +def shl_imm8_asmoperand : shl_imm_asmoperands<"8">; +def shl_imm16_asmoperand : shl_imm_asmoperands<"16">; +def shl_imm32_asmoperand : shl_imm_asmoperands<"32">; +def shl_imm64_asmoperand : shl_imm_asmoperands<"64">; + +def shl_imm8 : shl_imm<"8">, ImmLeaf= 0 && Imm < 8;}]>; +def shl_imm16 : shl_imm<"16">, ImmLeaf= 0 && Imm < 16;}]>; +def shl_imm32 : shl_imm<"32">, ImmLeaf= 0 && Imm < 32;}]>; +def shl_imm64 : shl_imm<"64">, ImmLeaf= 0 && Imm < 64;}]>; + +class N2VShift opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode> + : NeonI_2VShiftImm; + +multiclass NeonI_N2VShL opcode, string asmop> { + // 64-bit vector types. + def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + // 128-bit vector types. 
+ def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> { + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + } +} + +multiclass NeonI_N2VShR opcode, string asmop, SDNode OpNode> { + def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Shift left +defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">; + +// Shift right +defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>; +defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>; + +def Neon_High16B : PatFrag<(ops node:$in), + (extract_subvector (v16i8 node:$in), (iPTR 8))>; +def Neon_High8H : PatFrag<(ops node:$in), + (extract_subvector (v8i16 node:$in), (iPTR 4))>; +def Neon_High4S : PatFrag<(ops node:$in), + (extract_subvector (v4i32 node:$in), (iPTR 2))>; +def Neon_High2D : PatFrag<(ops node:$in), + (extract_subvector (v2i64 node:$in), (iPTR 1))>; +def Neon_High4float : PatFrag<(ops node:$in), + (extract_subvector (v4f32 node:$in), (iPTR 2))>; +def Neon_High2double : PatFrag<(ops node:$in), + (extract_subvector (v2f64 node:$in), (iPTR 1))>; + +def Neon_Low16B : PatFrag<(ops node:$in), + (v8i8 (extract_subvector (v16i8 node:$in), + (iPTR 0)))>; +def Neon_Low8H : PatFrag<(ops node:$in), + (v4i16 (extract_subvector (v8i16 node:$in), + (iPTR 0)))>; +def Neon_Low4S : PatFrag<(ops node:$in), + (v2i32 (extract_subvector (v4i32 node:$in), + (iPTR 0)))>; +def Neon_Low2D : PatFrag<(ops node:$in), + (v1i64 (extract_subvector (v2i64 node:$in), + (iPTR 0)))>; +def Neon_Low4float : PatFrag<(ops node:$in), + (v2f32 (extract_subvector (v4f32 node:$in), + (iPTR 0)))>; +def Neon_Low2double : PatFrag<(ops node:$in), + (v1f64 (extract_subvector (v2f64 node:$in), + (iPTR 0)))>; + +class N2VShiftLong opcode, string asmop, string DestT, + string SrcT, ValueType DestTy, ValueType SrcTy, + Operand ImmTy, SDPatternOperator ExtOp> + : NeonI_2VShiftImm; + +class N2VShiftLongHigh opcode, string asmop, string DestT, + string SrcT, ValueType DestTy, ValueType SrcTy, + int StartIndex, Operand ImmTy, + SDPatternOperator ExtOp, PatFrag getTop> + : NeonI_2VShiftImm; + +multiclass NeonI_N2VShLL opcode, string asmop, + SDNode ExtOp> { + // 64-bit vector types. 
+ def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8, + shl_imm8, ExtOp> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16, + shl_imm16, ExtOp> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32, + shl_imm32, ExtOp> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + // 128-bit vector types + def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8, + 8, shl_imm8, ExtOp, Neon_High16B> { + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + } + + def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16, + 4, shl_imm16, ExtOp, Neon_High8H> { + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + } + + def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32, + 2, shl_imm32, ExtOp, Neon_High4S> { + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + } + + // Use other patterns to match when the immediate is 0. + def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))), + (!cast(prefix # "_8B") VPR64:$Rn, 0)>; + + def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))), + (!cast(prefix # "_4H") VPR64:$Rn, 0)>; + + def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))), + (!cast(prefix # "_2S") VPR64:$Rn, 0)>; + + def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))), + (!cast(prefix # "_16B") VPR128:$Rn, 0)>; + + def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))), + (!cast(prefix # "_8H") VPR128:$Rn, 0)>; + + def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))), + (!cast(prefix # "_4S") VPR128:$Rn, 0)>; +} + +// Shift left long +defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>; +defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>; + +// Rounding/Saturating shift +class N2VShift_RQ opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDPatternOperator OpNode> + : NeonI_2VShiftImm; + +// shift right (vector by immediate) +multiclass NeonI_N2VShR_RQ opcode, string asmop, + SDPatternOperator OpNode> { + def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +multiclass NeonI_N2VShL_Q opcode, string asmop, + SDPatternOperator OpNode> { + // 64-bit vector types. + def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + // 128-bit vector types. 
+ def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Rounding shift right +defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr", + int_aarch64_neon_vsrshr>; +defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr", + int_aarch64_neon_vurshr>; + +// Saturating shift left unsigned +defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>; + +// Saturating shift left +defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>; +defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>; + +class N2VShiftAdd opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDNode OpNode> + : NeonI_2VShiftImm { + let Constraints = "$src = $Rd"; +} + +// Shift Right accumulate +multiclass NeonI_N2VShRAdd opcode, string asmop, SDNode OpNode> { + def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Shift right and accumulate +defm SSRAvvi : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>; +defm USRAvvi : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>; + +// Rounding shift accumulate +class N2VShiftAdd_R opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDPatternOperator OpNode> + : NeonI_2VShiftImm { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_N2VShRAdd_R opcode, string asmop, + SDPatternOperator OpNode> { + def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + OpNode> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + OpNode> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + OpNode> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + OpNode> { + let Inst{22} = 0b1; + } +} + +// Rounding shift right and accumulate +defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", 
int_aarch64_neon_vsrshr>; +defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>; + +// Shift insert by immediate +class N2VShiftIns opcode, string asmop, string T, + RegisterOperand VPRC, ValueType Ty, Operand ImmTy, + SDPatternOperator OpNode> + : NeonI_2VShiftImm { + let Constraints = "$src = $Rd"; +} + +// shift left insert (vector by immediate) +multiclass NeonI_N2VShLIns opcode, string asmop> { + def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, + int_aarch64_neon_vsli> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, + int_aarch64_neon_vsli> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, + int_aarch64_neon_vsli> { + let Inst{22-21} = 0b01; + } + + // 128-bit vector types + def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, + int_aarch64_neon_vsli> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, + int_aarch64_neon_vsli> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, + int_aarch64_neon_vsli> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, + int_aarch64_neon_vsli> { + let Inst{22} = 0b1; + } +} + +// shift right insert (vector by immediate) +multiclass NeonI_N2VShRIns opcode, string asmop> { + // 64-bit vector types. + def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, + int_aarch64_neon_vsri> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, + int_aarch64_neon_vsri> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, + int_aarch64_neon_vsri> { + let Inst{22-21} = 0b01; + } + + // 128-bit vector types + def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, + int_aarch64_neon_vsri> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, + int_aarch64_neon_vsri> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, + int_aarch64_neon_vsri> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, + int_aarch64_neon_vsri> { + let Inst{22} = 0b1; + } +} + +// Shift left and insert +defm SLIvvi : NeonI_N2VShLIns<0b1, 0b01010, "sli">; + +// Shift right and insert +defm SRIvvi : NeonI_N2VShRIns<0b1, 0b01000, "sri">; + +class N2VShR_Narrow opcode, string asmop, string DestT, + string SrcT, Operand ImmTy> + : NeonI_2VShiftImm; + +class N2VShR_Narrow_Hi opcode, string asmop, string DestT, + string SrcT, Operand ImmTy> + : NeonI_2VShiftImm { + let Constraints = "$src = $Rd"; +} + +// left long shift by immediate +multiclass NeonI_N2VShR_Narrow opcode, string asmop> { + def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> { + let Inst{22-19} = 0b0001; + } + + def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> { + let Inst{22-20} = 0b001; + } + + def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> { + let Inst{22-21} = 0b01; + } + + // Shift Narrow High + def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h", + shr_imm8> { + let Inst{22-19} = 0b0001; + } + + def _8H : N2VShR_Narrow_Hi<0b1, u, 
opcode, asmop # "2", "8h", "4s", + shr_imm16> { + let Inst{22-20} = 0b001; + } + + def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d", + shr_imm32> { + let Inst{22-21} = 0b01; + } +} + +// Shift right narrow +defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">; + +// Shift right narrow (prefix Q is saturating, prefix R is rounding) +defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">; +defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">; +defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">; +defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">; +defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">; +defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">; +defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">; + +def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn), + (v2i64 (concat_vectors (v1i64 node:$Rm), + (v1i64 node:$Rn)))>; +def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn), + (v8i16 (concat_vectors (v4i16 node:$Rm), + (v4i16 node:$Rn)))>; +def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn), + (v4i32 (concat_vectors (v2i32 node:$Rm), + (v2i32 node:$Rn)))>; +def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn), + (v4f32 (concat_vectors (v2f32 node:$Rm), + (v2f32 node:$Rn)))>; +def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn), + (v2f64 (concat_vectors (v1f64 node:$Rm), + (v1f64 node:$Rn)))>; + +def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), + (v8i16 (srl (v8i16 node:$lhs), + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs), + (v4i32 (srl (v4i32 node:$lhs), + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs), + (v2i64 (srl (v2i64 node:$lhs), + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs), + (v8i16 (sra (v8i16 node:$lhs), + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs), + (v4i32 (sra (v4i32 node:$lhs), + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; +def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs), + (v2i64 (sra (v2i64 node:$lhs), + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; + +// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors) +multiclass Neon_shiftNarrow_patterns { + def : Pat<(v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") VPR128:$Rn, + (i32 shr_imm8:$Imm)))), + (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>; + def : Pat<(v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") VPR128:$Rn, + (i32 shr_imm16:$Imm)))), + (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>; + def : Pat<(v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") VPR128:$Rn, + (i32 shr_imm32:$Imm)))), + (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; + + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert + (v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") + VPR128:$Rn, (i32 shr_imm8:$Imm))))))), + (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert + (v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") + VPR128:$Rn, (i32 shr_imm16:$Imm))))))), + (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert + (v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") + VPR128:$Rn, (i32 shr_imm32:$Imm))))))), + (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; +} + +multiclass Neon_shiftNarrow_QR_patterns { + def : Pat<(v8i8 (op 
(v8i16 VPR128:$Rn), shr_imm8:$Imm)), + (!cast(prefix # "_8B") VPR128:$Rn, imm:$Imm)>; + def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)), + (!cast(prefix # "_4H") VPR128:$Rn, imm:$Imm)>; + def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)), + (!cast(prefix # "_2S") VPR128:$Rn, imm:$Imm)>; + + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (v8i8 + (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))), + (!cast(prefix # "_16B") + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (v4i16 + (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))), + (!cast(prefix # "_8H") + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (v2i32 + (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))), + (!cast(prefix # "_4S") + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, imm:$Imm)>; +} + +defm : Neon_shiftNarrow_patterns<"lshr">; +defm : Neon_shiftNarrow_patterns<"ashr">; + +defm : Neon_shiftNarrow_QR_patterns; +defm : Neon_shiftNarrow_QR_patterns; +defm : Neon_shiftNarrow_QR_patterns; +defm : Neon_shiftNarrow_QR_patterns; +defm : Neon_shiftNarrow_QR_patterns; +defm : Neon_shiftNarrow_QR_patterns; +defm : Neon_shiftNarrow_QR_patterns; + +// Convert fix-point and float-pointing +class N2VCvt_Fx opcode, string asmop, string T, + RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy, + Operand ImmTy, SDPatternOperator IntOp> + : NeonI_2VShiftImm; + +multiclass NeonI_N2VCvt_Fx2fp opcode, string asmop, + SDPatternOperator IntOp> { + def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64, + shr_imm64, IntOp> { + let Inst{22} = 0b1; + } +} + +multiclass NeonI_N2VCvt_Fp2fx opcode, string asmop, + SDPatternOperator IntOp> { + def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32, + shr_imm32, IntOp> { + let Inst{22-21} = 0b01; + } + + def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64, + shr_imm64, IntOp> { + let Inst{22} = 0b1; + } +} + +// Convert fixed-point to floating-point +defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf", + int_arm_neon_vcvtfxs2fp>; +defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf", + int_arm_neon_vcvtfxu2fp>; + +// Convert floating-point to fixed-point +defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs", + int_arm_neon_vcvtfp2fxs>; +defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu", + int_arm_neon_vcvtfp2fxu>; + +multiclass Neon_sshll2_0 +{ + def _v8i8 : PatFrag<(ops node:$Rn), + (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>; + def _v4i16 : PatFrag<(ops node:$Rn), + (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>; + def _v2i32 : PatFrag<(ops node:$Rn), + (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>; +} + +defm NI_sext_high : Neon_sshll2_0; +defm NI_zext_high : Neon_sshll2_0; + + +//===----------------------------------------------------------------------===// +// Multiclasses for NeonI_Across +//===----------------------------------------------------------------------===// + +// Variant 1 + +multiclass NeonI_2VAcross_1 opcode, + string asmop, SDPatternOperator 
opnode> +{ + def _1h8b: NeonI_2VAcross<0b0, u, 0b00, opcode, + (outs FPR16:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.8b", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode, + (outs FPR16:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.16b", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode, + (outs FPR32:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.4h", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode, + (outs FPR32:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.8h", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v8i16 VPR128:$Rn))))], + NoItinerary>; + + // _1d2s doesn't exist! + + def _1d4s: NeonI_2VAcross<0b1, u, 0b10, opcode, + (outs FPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.4s", + [(set (v1i64 FPR64:$Rd), + (v1i64 (opnode (v4i32 VPR128:$Rn))))], + NoItinerary>; +} + +defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>; +defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>; + +// Variant 2 + +multiclass NeonI_2VAcross_2 opcode, + string asmop, SDPatternOperator opnode> +{ + def _1b8b: NeonI_2VAcross<0b0, u, 0b00, opcode, + (outs FPR8:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.8b", + [(set (v1i8 FPR8:$Rd), + (v1i8 (opnode (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode, + (outs FPR8:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.16b", + [(set (v1i8 FPR8:$Rd), + (v1i8 (opnode (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode, + (outs FPR16:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd, $Rn.4h", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode, + (outs FPR16:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.8h", + [(set (v1i16 FPR16:$Rd), + (v1i16 (opnode (v8i16 VPR128:$Rn))))], + NoItinerary>; + + // _1s2s doesn't exist! 
+ + def _1s4s: NeonI_2VAcross<0b1, u, 0b10, opcode, + (outs FPR32:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.4s", + [(set (v1i32 FPR32:$Rd), + (v1i32 (opnode (v4i32 VPR128:$Rn))))], + NoItinerary>; +} + +defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>; +defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>; + +defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>; +defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>; + +defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>; + +// Variant 3 + +multiclass NeonI_2VAcross_3 opcode, bits<2> size, + string asmop, SDPatternOperator opnode> { + def _1s4s: NeonI_2VAcross<0b1, u, size, opcode, + (outs FPR32:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd, $Rn.4s", + [(set (v1f32 FPR32:$Rd), + (v1f32 (opnode (v4f32 VPR128:$Rn))))], + NoItinerary>; +} + +defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv", + int_aarch64_neon_vmaxnmv>; +defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv", + int_aarch64_neon_vminnmv>; + +defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv", + int_aarch64_neon_vmaxv>; +defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv", + int_aarch64_neon_vminv>; + +// The followings are for instruction class (Perm) + +class NeonI_Permute size, bits<3> opcode, + string asmop, RegisterOperand OpVPR, string OpS, + SDPatternOperator opnode, ValueType Ty> + : NeonI_Perm; + +multiclass NeonI_Perm_pat opcode, string asmop, + SDPatternOperator opnode> { + def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, + VPR64, "8b", opnode, v8i8>; + def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, + VPR128, "16b",opnode, v16i8>; + def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, + VPR64, "4h", opnode, v4i16>; + def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, + VPR128, "8h", opnode, v8i16>; + def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, + VPR64, "2s", opnode, v2i32>; + def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, + VPR128, "4s", opnode, v4i32>; + def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, + VPR128, "2d", opnode, v2i64>; +} + +defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>; +defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>; +defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>; +defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>; +defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>; +defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>; + +multiclass NeonI_Perm_float_pat { + def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (!cast(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>; + + def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (!cast(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>; + + def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (!cast(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>; +} + +defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>; +defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>; +defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>; +defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>; +defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>; +defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>; + +// The followings are for instruction class (3V Diff) + +// normal long/long2 pattern +class NeonI_3VDL size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator ext, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff; + +multiclass NeonI_3VDL_s opcode, + 
string asmop, SDPatternOperator opnode, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, sext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, sext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, sext, VPR64, v2i64, v2i32>; + } +} + +multiclass NeonI_3VDL2_s opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; + } +} + +multiclass NeonI_3VDL_u opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, zext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, zext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, zext, VPR64, v2i64, v2i32>; + } +} + +multiclass NeonI_3VDL2_u opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; + } +} + +defm SADDLvvv : NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>; +defm UADDLvvv : NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>; + +defm SADDL2vvv : NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>; +defm UADDL2vvv : NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>; + +defm SSUBLvvv : NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>; +defm USUBLvvv : NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>; + +defm SSUBL2vvv : NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>; +defm USUBL2vvv : NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>; + +// normal wide/wide2 pattern +class NeonI_3VDW size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator ext, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff; + +multiclass NeonI_3VDW_s opcode, string asmop, + SDPatternOperator opnode> { + def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, sext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, sext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, sext, VPR64, v2i64, v2i32>; +} + +defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>; +defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>; + +multiclass NeonI_3VDW2_s opcode, string asmop, + SDPatternOperator opnode> { + def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_sext_high_v2i32, 
VPR128, v2i64, v4i32>; +} + +defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>; +defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>; + +multiclass NeonI_3VDW_u opcode, string asmop, + SDPatternOperator opnode> { + def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, zext, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, zext, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, zext, VPR64, v2i64, v2i32>; +} + +defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>; +defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>; + +multiclass NeonI_3VDW2_u opcode, string asmop, + SDPatternOperator opnode> { + def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; +} + +defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>; +defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>; + +// Get the high half part of the vector element. +multiclass NeonI_get_high { + def _8h : PatFrag<(ops node:$Rn), + (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn), + (v8i16 (Neon_vdup (i32 8)))))))>; + def _4s : PatFrag<(ops node:$Rn), + (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn), + (v4i32 (Neon_vdup (i32 16)))))))>; + def _2d : PatFrag<(ops node:$Rn), + (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn), + (v2i64 (Neon_vdup (i32 32)))))))>; +} + +defm NI_get_hi : NeonI_get_high; + +// pattern for addhn/subhn with 2 operands +class NeonI_3VDN_addhn_2Op size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator get_hi, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff; + +multiclass NeonI_3VDN_addhn_2Op opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", + opnode, NI_get_hi_8h, v8i8, v8i16>; + def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", + opnode, NI_get_hi_4s, v4i16, v4i32>; + def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d", + opnode, NI_get_hi_2d, v2i32, v2i64>; + } +} + +defm ADDHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>; +defm SUBHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>; + +// pattern for operation with 2 operands +class NeonI_3VD_2Op size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + RegisterOperand ResVPR, RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff; + +// normal narrow pattern +multiclass NeonI_3VDN_2Op opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", + opnode, VPR64, VPR128, v8i8, v8i16>; + def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", + opnode, VPR64, VPR128, v4i16, v4i32>; + def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d", + opnode, VPR64, VPR128, v2i32, v2i64>; + } +} + +defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>; +defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>; + +// pattern for acle intrinsic with 3 operands 
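// The narrowing forms above and the 3-operand "high" forms defined next
// correspond to the ACLE vaddhn/vaddhn_high intrinsic pair. A minimal C
// sketch of that mapping, assuming <arm_neon.h> on an AArch64 target
// (illustration kept in comments):
//
//   #include <arm_neon.h>
//
//   // addhn: keep the high half of each 32-bit sum as a 16-bit element.
//   int16x4_t narrow_sum(int32x4_t a, int32x4_t b) {
//     return vaddhn_s32(a, b);           // may select: addhn v0.4h, v1.4s, v2.4s
//   }
//
//   // addhn2: the same operation written into the high half of an existing
//   // result, i.e. the tied 3-operand ($src = $Rd) form.
//   int16x8_t narrow_sum_high(int16x4_t lo, int32x4_t a, int32x4_t b) {
//     return vaddhn_high_s32(lo, a, b);  // may select: addhn2 v0.8h, v1.4s, v2.4s
//   }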
+class NeonI_3VDN_3Op size, bits<4> opcode, + string asmop, string ResS, string OpS> + : NeonI_3VDiff { + let Constraints = "$src = $Rd"; + let neverHasSideEffects = 1; +} + +multiclass NeonI_3VDN_3Op_v1 opcode, string asmop> { + def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">; + def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">; + def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">; +} + +defm ADDHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">; +defm SUBHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">; + +defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">; +defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">; + +// Patterns have to be separate because there's a SUBREG_TO_REG in the output +// part. +class NarrowHighHalfPat + : Pat<(Neon_combine_2D (v1i64 VPR64:$src), + (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn), + (SrcTy VPR128:$Rm)))))), + (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, VPR128:$Rm)>; + +// addhn2 patterns +def : NarrowHighHalfPat>; +def : NarrowHighHalfPat>; +def : NarrowHighHalfPat>; + +// subhn2 patterns +def : NarrowHighHalfPat>; +def : NarrowHighHalfPat>; +def : NarrowHighHalfPat>; + +// raddhn2 patterns +def : NarrowHighHalfPat; +def : NarrowHighHalfPat; +def : NarrowHighHalfPat; + +// rsubhn2 patterns +def : NarrowHighHalfPat; +def : NarrowHighHalfPat; +def : NarrowHighHalfPat; + +// pattern that need to extend result +class NeonI_3VDL_Ext size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, ValueType OpSTy> + : NeonI_3VDiff; + +multiclass NeonI_3VDL_zext opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, VPR64, v8i16, v8i8, v8i8>; + def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, VPR64, v4i32, v4i16, v4i16>; + def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, VPR64, v2i64, v2i32, v2i32>; + } +} + +defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>; +defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>; + +multiclass NeonI_Op_High { + def _16B : PatFrag<(ops node:$Rn, node:$Rm), + (op (v8i8 (Neon_High16B node:$Rn)), + (v8i8 (Neon_High16B node:$Rm)))>; + def _8H : PatFrag<(ops node:$Rn, node:$Rm), + (op (v4i16 (Neon_High8H node:$Rn)), + (v4i16 (Neon_High8H node:$Rm)))>; + def _4S : PatFrag<(ops node:$Rn, node:$Rm), + (op (v2i32 (Neon_High4S node:$Rn)), + (v2i32 (Neon_High4S node:$Rm)))>; +} + +defm NI_sabdl_hi : NeonI_Op_High; +defm NI_uabdl_hi : NeonI_Op_High; +defm NI_smull_hi : NeonI_Op_High; +defm NI_umull_hi : NeonI_Op_High; +defm NI_qdmull_hi : NeonI_Op_High; +defm NI_pmull_hi : NeonI_Op_High; + +multiclass NeonI_3VDL_Abd_u opcode, string asmop, string opnode, + bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b", + !cast(opnode # "_16B"), + VPR128, v8i16, v16i8, v8i8>; + def _4s4h : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h", + !cast(opnode # "_8H"), + VPR128, v4i32, v8i16, v4i16>; + def _2d2s : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s", + !cast(opnode # "_4S"), + VPR128, v2i64, v4i32, v2i32>; + } +} + +defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>; +defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 
0b0111, "uabdl2", "NI_uabdl_hi", 1>; + +// For pattern that need two operators being chained. +class NeonI_3VDL_Aba size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, SDPatternOperator subop, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy, ValueType OpSTy> + : NeonI_3VDiff { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_3VDL_Aba_v1 opcode, string asmop, + SDPatternOperator opnode, SDPatternOperator subop>{ + def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, subop, VPR64, v8i16, v8i8, v8i8>; + def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, subop, VPR64, v4i32, v4i16, v4i16>; + def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, subop, VPR64, v2i64, v2i32, v2i32>; +} + +defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal", + add, int_arm_neon_vabds>; +defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal", + add, int_arm_neon_vabdu>; + +multiclass NeonI_3VDL2_Aba_v1 opcode, string asmop, + SDPatternOperator opnode, string subop> { + def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b", + opnode, !cast(subop # "_16B"), + VPR128, v8i16, v16i8, v8i8>; + def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h", + opnode, !cast(subop # "_8H"), + VPR128, v4i32, v8i16, v4i16>; + def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s", + opnode, !cast(subop # "_4S"), + VPR128, v2i64, v4i32, v2i32>; +} + +defm SABAL2vvv : NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add, + "NI_sabdl_hi">; +defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add, + "NI_uabdl_hi">; + +// Long pattern with 2 operands +multiclass NeonI_3VDL_2Op opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", + opnode, VPR128, VPR64, v8i16, v8i8>; + def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, VPR128, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, VPR128, VPR64, v2i64, v2i32>; + } +} + +defm SMULLvvv : NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>; +defm UMULLvvv : NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>; + +class NeonI_3VDL2_2Op_mull size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff; + +multiclass NeonI_3VDL2_2Op_mull_v1 opcode, string asmop, + string opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", + !cast(opnode # "_16B"), + v8i16, v16i8>; + def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", + !cast(opnode # "_8H"), + v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", + !cast(opnode # "_4S"), + v2i64, v4i32>; + } +} + +defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2", + "NI_smull_hi", 1>; +defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2", + "NI_umull_hi", 1>; + +// Long pattern with 3 operands +class NeonI_3VDL_3Op size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator opnode, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_3VDL_3Op_v1 opcode, string asmop, + SDPatternOperator opnode> { + def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, 
asmop, "8h", "8b", + opnode, v8i16, v8i8>; + def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, v4i32, v4i16>; + def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, v2i64, v2i32>; +} + +def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (add node:$Rd, + (int_arm_neon_vmulls node:$Rn, node:$Rm))>; + +def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (add node:$Rd, + (int_arm_neon_vmullu node:$Rn, node:$Rm))>; + +def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (sub node:$Rd, + (int_arm_neon_vmulls node:$Rn, node:$Rm))>; + +def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), + (sub node:$Rd, + (int_arm_neon_vmullu node:$Rn, node:$Rm))>; + +defm SMLALvvv : NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>; +defm UMLALvvv : NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>; + +defm SMLSLvvv : NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>; +defm UMLSLvvv : NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>; + +class NeonI_3VDL2_3Op_mlas size, bits<4> opcode, + string asmop, string ResS, string OpS, + SDPatternOperator subop, SDPatternOperator opnode, + RegisterOperand OpVPR, + ValueType ResTy, ValueType OpTy> + : NeonI_3VDiff { + let Constraints = "$src = $Rd"; +} + +multiclass NeonI_3VDL2_3Op_mlas_v1 opcode, string asmop, + SDPatternOperator subop, string opnode> { + def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b", + subop, !cast(opnode # "_16B"), + VPR128, v8i16, v16i8>; + def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", + subop, !cast(opnode # "_8H"), + VPR128, v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", + subop, !cast(opnode # "_4S"), + VPR128, v2i64, v4i32>; +} + +defm SMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2", + add, "NI_smull_hi">; +defm UMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2", + add, "NI_umull_hi">; + +defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2", + sub, "NI_smull_hi">; +defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2", + sub, "NI_umull_hi">; + +multiclass NeonI_3VDL_qdmlal_3Op_v2 opcode, string asmop, + SDPatternOperator opnode> { + def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, int_arm_neon_vqdmull, + VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, int_arm_neon_vqdmull, + VPR64, v2i64, v2i32>; +} + +defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal", + int_arm_neon_vqadds>; +defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl", + int_arm_neon_vqsubs>; + +multiclass NeonI_3VDL_v2 opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", + opnode, VPR128, VPR64, v4i32, v4i16>; + def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", + opnode, VPR128, VPR64, v2i64, v2i32>; + } +} + +defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull", + int_arm_neon_vqdmull, 1>; + +multiclass NeonI_3VDL2_2Op_mull_v2 opcode, string asmop, + string opnode, bit Commutable = 0> { + let isCommutable = Commutable in { + def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", + !cast(opnode # "_8H"), + v4i32, v8i16>; + def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", + !cast(opnode # "_4S"), + v2i64, v4i32>; + } +} + +defm SQDMULL2vvv : 
NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2",
+                                           "NI_qdmull_hi", 1>;
+
+multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop,
+                                     SDPatternOperator opnode> {
+  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                   opnode, NI_qdmull_hi_8H,
+                                   VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                   opnode, NI_qdmull_hi_4S,
+                                   VPR128, v2i64, v4i32>;
+}
+
+defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2",
+                                             int_arm_neon_vqadds>;
+defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
+                                             int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                              opnode, VPR128, VPR64, v8i16, v8i8>;
+
+    def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
+                             (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+                             asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
+                             [], NoItinerary>;
+  }
+}
+
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                      !cast<PatFrag>(opnode # "_16B"),
+                                      v8i16, v16i8>;
+
+    def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                             asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+                             [], NoItinerary>;
+  }
+}
+
+defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
+                                         1>;
+
+// End of implementation for instruction class (3V Diff)
+
+// The following are vector load/store multiple N-element structure
+// (class SIMD lselem).
+
+// ld1: load multiple 1-element structure to 1/2/3/4 registers.
+// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4).
+// The structure consists of a sequence of sets of N values.
+// The first element of the structure is placed in the first lane
+// of the first vector, the second element in the first lane
+// of the second vector, and so on.
+// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
+// the three 64-bit vectors list {BA, DC, FE}.
+// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
+// 64-bit vectors list {DA, EB, FC}.
+// Store instructions store multiple structures from N registers in the same way.
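// A minimal C sketch of the two layouts described above, assuming
// <arm_neon.h> on an AArch64 target (illustration kept in comments):
//
//   #include <arm_neon.h>
//
//   // ld3: the six 32-bit elements {A, B, C, D, E, F} are de-interleaved
//   // into the three 64-bit vectors {DA, EB, FC}, as for LD3_2S above.
//   int32x2x3_t load_deinterleaved(const int32_t *p) {
//     return vld3_s32(p);        // may select: ld3 {v0.2s - v2.2s}, [x0]
//   }
//
//   // ld1: a plain vector load of the same memory keeps the sequential
//   // layout, i.e. the first vector holds {BA}, the next one {DC}, and so on.
//   int32x2_t load_sequential(const int32_t *p) {
//     return vld1_s32(p);        // may select: ld1 {v0.2s}, [x0]
//   }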
+ + +class NeonI_LDVList opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdStMult { + let mayLoad = 1; + let neverHasSideEffects = 1; +} + +multiclass LDVList_BHSD opcode, string List, string asmop> { + def _8B : NeonI_LDVList<0, opcode, 0b00, + !cast(List # "8B_operand"), asmop>; + + def _4H : NeonI_LDVList<0, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; + + def _2S : NeonI_LDVList<0, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; + + def _16B : NeonI_LDVList<1, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; + + def _8H : NeonI_LDVList<1, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; + + def _4S : NeonI_LDVList<1, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; + + def _2D : NeonI_LDVList<1, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; +} + +// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4) +defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; +def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; + +defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; + +defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + +defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; + +// Load multiple 1-element structure to N consecutive registers (N = 2,3,4) +defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; +def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; + +defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">; +def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; + +defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">; +def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; + +class NeonI_STVList opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdStMult { + let mayStore = 1; + let neverHasSideEffects = 1; +} + +multiclass STVList_BHSD opcode, string List, string asmop> { + def _8B : NeonI_STVList<0, opcode, 0b00, + !cast(List # "8B_operand"), asmop>; + + def _4H : NeonI_STVList<0, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; + + def _2S : NeonI_STVList<0, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; + + def _16B : NeonI_STVList<1, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; + + def _8H : NeonI_STVList<1, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; + + def _4S : NeonI_STVList<1, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; + + def _2D : NeonI_STVList<1, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; +} + +// Store multiple N-element structures from N registers (N = 1,2,3,4) +defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; +def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; + +defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; + +defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; + +defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; + +// Store multiple 1-element structures from N consecutive registers (N = 2,3,4) +defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; +def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; + +defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; +def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; + +defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; +def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; + +def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; +def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; + +def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; +def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; + +def : Pat<(v8i16 (load 
GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; +def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; + +def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; +def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + +def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; +def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + +def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; +def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; + +def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), + (ST1_8H GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), + (ST1_16B GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + +def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + +def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), + (ST1_4H GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), + (ST1_8B GPR64xsp:$addr, VPR64:$value)>; + +// End of vector load/store multiple N-element structure(class SIMD lselem) + +// The followings are post-index vector load/store multiple N-element +// structure(class SIMD lselem-post) +def exact1_asmoperand : AsmOperandClass { + let Name = "Exact1"; + let PredicateMethod = "isExactImm<1>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact1 : Operand, ImmLeaf { + let ParserMatchClass = exact1_asmoperand; +} + +def exact2_asmoperand : AsmOperandClass { + let Name = "Exact2"; + let PredicateMethod = "isExactImm<2>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact2 : Operand, ImmLeaf { + let ParserMatchClass = exact2_asmoperand; +} + +def exact3_asmoperand : AsmOperandClass { + let Name = "Exact3"; + let PredicateMethod = "isExactImm<3>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact3 : Operand, ImmLeaf { + let ParserMatchClass = exact3_asmoperand; +} + +def exact4_asmoperand : AsmOperandClass { + let Name = "Exact4"; + let PredicateMethod = "isExactImm<4>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact4 : Operand, ImmLeaf { + let ParserMatchClass = exact4_asmoperand; +} + +def exact6_asmoperand : AsmOperandClass { + let Name = "Exact6"; + let PredicateMethod = "isExactImm<6>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact6 : Operand, ImmLeaf { + let ParserMatchClass = exact6_asmoperand; +} + +def exact8_asmoperand : AsmOperandClass { + let Name = "Exact8"; + let PredicateMethod = "isExactImm<8>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact8 : Operand, ImmLeaf { + let ParserMatchClass = exact8_asmoperand; +} + +def exact12_asmoperand : AsmOperandClass { + let Name = "Exact12"; + let PredicateMethod = "isExactImm<12>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact12 : Operand, ImmLeaf { + let 
ParserMatchClass = exact12_asmoperand; +} + +def exact16_asmoperand : AsmOperandClass { + let Name = "Exact16"; + let PredicateMethod = "isExactImm<16>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact16 : Operand, ImmLeaf { + let ParserMatchClass = exact16_asmoperand; +} + +def exact24_asmoperand : AsmOperandClass { + let Name = "Exact24"; + let PredicateMethod = "isExactImm<24>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact24 : Operand, ImmLeaf { + let ParserMatchClass = exact24_asmoperand; +} + +def exact32_asmoperand : AsmOperandClass { + let Name = "Exact32"; + let PredicateMethod = "isExactImm<32>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact32 : Operand, ImmLeaf { + let ParserMatchClass = exact32_asmoperand; +} + +def exact48_asmoperand : AsmOperandClass { + let Name = "Exact48"; + let PredicateMethod = "isExactImm<48>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact48 : Operand, ImmLeaf { + let ParserMatchClass = exact48_asmoperand; +} + +def exact64_asmoperand : AsmOperandClass { + let Name = "Exact64"; + let PredicateMethod = "isExactImm<64>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact64 : Operand, ImmLeaf { + let ParserMatchClass = exact64_asmoperand; +} + +multiclass NeonI_LDWB_VList opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1, + DecoderMethod = "DecodeVLDSTPostInstruction" in { + def _fixed : NeonI_LdStMult_Post { + let Rm = 0b11111; + } + + def _register : NeonI_LdStMult_Post; + } +} + +multiclass LDWB_VList_BHSD opcode, string List, Operand ImmTy, + Operand ImmTy2, string asmop> { + defm _8B : NeonI_LDWB_VList<0, opcode, 0b00, + !cast(List # "8B_operand"), + ImmTy, asmop>; + + defm _4H : NeonI_LDWB_VList<0, opcode, 0b01, + !cast(List # "4H_operand"), + ImmTy, asmop>; + + defm _2S : NeonI_LDWB_VList<0, opcode, 0b10, + !cast(List # "2S_operand"), + ImmTy, asmop>; + + defm _16B : NeonI_LDWB_VList<1, opcode, 0b00, + !cast(List # "16B_operand"), + ImmTy2, asmop>; + + defm _8H : NeonI_LDWB_VList<1, opcode, 0b01, + !cast(List # "8H_operand"), + ImmTy2, asmop>; + + defm _4S : NeonI_LDWB_VList<1, opcode, 0b10, + !cast(List # "4S_operand"), + ImmTy2, asmop>; + + defm _2D : NeonI_LDWB_VList<1, opcode, 0b11, + !cast(List # "2D_operand"), + ImmTy2, asmop>; +} + +// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) +defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; +defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "ld1">; + +defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; + +defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "ld3">; + +defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; + +// Post-index load multiple 1-element structures from N consecutive registers +// (N = 2,3,4) +defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "ld1">; +defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "ld1">; + +defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "ld1">; +defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "ld1">; + +defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "ld1">; +defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "ld1">; + +multiclass 
NeonI_STWB_VList opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1, + DecoderMethod = "DecodeVLDSTPostInstruction" in { + def _fixed : NeonI_LdStMult_Post { + let Rm = 0b11111; + } + + def _register : NeonI_LdStMult_Post; + } +} + +multiclass STWB_VList_BHSD opcode, string List, Operand ImmTy, + Operand ImmTy2, string asmop> { + defm _8B : NeonI_STWB_VList<0, opcode, 0b00, + !cast(List # "8B_operand"), ImmTy, asmop>; + + defm _4H : NeonI_STWB_VList<0, opcode, 0b01, + !cast(List # "4H_operand"), + ImmTy, asmop>; + + defm _2S : NeonI_STWB_VList<0, opcode, 0b10, + !cast(List # "2S_operand"), + ImmTy, asmop>; + + defm _16B : NeonI_STWB_VList<1, opcode, 0b00, + !cast(List # "16B_operand"), + ImmTy2, asmop>; + + defm _8H : NeonI_STWB_VList<1, opcode, 0b01, + !cast(List # "8H_operand"), + ImmTy2, asmop>; + + defm _4S : NeonI_STWB_VList<1, opcode, 0b10, + !cast(List # "4S_operand"), + ImmTy2, asmop>; + + defm _2D : NeonI_STWB_VList<1, opcode, 0b11, + !cast(List # "2D_operand"), + ImmTy2, asmop>; +} + +// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) +defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; +defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "st1">; + +defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; + +defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "st3">; + +defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; + +// Post-index load multiple 1-element structures from N consecutive registers +// (N = 2,3,4) +defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "st1">; +defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "st1">; + +defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "st1">; +defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "st1">; + +defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "st1">; +defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "st1">; + +// End of post-index vector load/store multiple N-element structure +// (class SIMD lselem-post) + +// The followings are vector load/store single N-element structure +// (class SIMD lsone). 
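// The single-structure forms defined below come in two flavours: "dup" loads
// that replicate one element into every lane (ld1r/ld2r/ld3r/ld4r), and lane
// loads/stores that touch exactly one lane. A minimal C sketch, assuming
// <arm_neon.h> on an AArch64 target (illustration kept in comments):
//
//   #include <arm_neon.h>
//
//   // ld1r: load one 32-bit element and broadcast it to all four lanes.
//   int32x4_t load_dup(const int32_t *p) {
//     return vld1q_dup_s32(p);           // may select: ld1r {v0.4s}, [x0]
//   }
//
//   // ld2, single lane: load one pair of elements into lane 1 of two
//   // existing vectors, leaving the other lanes untouched.
//   int32x4x2_t load_lane(const int32_t *p, int32x4x2_t acc) {
//     return vld2q_lane_s32(p, acc, 1);  // may select: ld2 {v0.s, v1.s}[1], [x0]
//   }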
+def neon_uimm0_bare : Operand, + ImmLeaf { + let ParserMatchClass = neon_uimm0_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm1_bare : Operand, + ImmLeaf { + let ParserMatchClass = neon_uimm1_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm2_bare : Operand, + ImmLeaf { + let ParserMatchClass = neon_uimm2_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm3_bare : Operand, + ImmLeaf { + let ParserMatchClass = uimm3_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +def neon_uimm4_bare : Operand, + ImmLeaf { + let ParserMatchClass = uimm4_asmoperand; + let PrintMethod = "printUImmBareOperand"; +} + +class NeonI_LDN_Dup opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdOne_Dup { + let mayLoad = 1; + let neverHasSideEffects = 1; +} + +multiclass LDN_Dup_BHSD opcode, string List, string asmop> { + def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00, + !cast(List # "8B_operand"), asmop>; + + def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; + + def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; + + def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11, + !cast(List # "1D_operand"), asmop>; + + def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; + + def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; + + def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; + + def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; +} + +// Load single 1-element structure to all lanes of 1 register +defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; + +// Load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; +defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; +defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; + + +class LD1R_pattern + : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))), + (VTy (INST GPR64xsp:$Rn))>; + +// Match all LD1R instructions +def : LD1R_pattern; + +def : LD1R_pattern; + +def : LD1R_pattern; + +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + +def : LD1R_pattern; +def : LD1R_pattern; + + +multiclass VectorList_Bare_BHSD { + defm B : VectorList_operands; + defm H : VectorList_operands; + defm S : VectorList_operands; + defm D : VectorList_operands; +} + +// Special vector list operand of 128-bit vectors with bare layout. +// i.e. 
only show ".b", ".h", ".s", ".d" +defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>; +defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>; +defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>; +defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>; + +class NeonI_LDN_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<1, r, op2_1, op0, + (outs VList:$Rt), + (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayLoad = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; + let Constraints = "$src = $Rt"; +} + +multiclass LDN_Lane_BHSD { + def _B : NeonI_LDN_Lane(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H : NeonI_LDN_Lane(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S : NeonI_LDN_Lane(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D : NeonI_LDN_Lane(List # "D_operand"), + neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Load single 1-element structure to one lane of 1 register. +defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; + +// Load single N-element structure to one lane of N consecutive registers +// (N = 2,3,4) +defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; +defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; +defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; + +multiclass LD1LN_patterns { + def : Pat<(VTy (vector_insert (VTy VPR64:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), + (VTy (EXTRACT_SUBREG + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + ImmOp:$lane), + sub_64))>; + + def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), + (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; +} + +// Match all LD1LN instructions +defm : LD1LN_patterns; + +defm : LD1LN_patterns; + +defm : LD1LN_patterns; +defm : LD1LN_patterns; + +defm : LD1LN_patterns; +defm : LD1LN_patterns; + +class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<0, r, op2_1, op0, + (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayStore = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; +} + +multiclass STN_Lane_BHSD { + def _B : NeonI_STN_Lane(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H : NeonI_STN_Lane(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S : NeonI_STN_Lane(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D : NeonI_STN_Lane(List # "D_operand"), + neon_uimm1_bare, asmop>{ + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Store single 1-element structure from one lane of 1 register. 
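// The lane stores defined next mirror the lane loads above: st1 with a lane
// index writes a single element out of a vector register. A minimal C sketch,
// assuming <arm_neon.h> on an AArch64 target (illustration kept in comments):
//
//   #include <arm_neon.h>
//
//   // st1, single lane: store lane 3 of a 4s vector as one 32-bit element.
//   void store_lane(int32_t *p, int32x4_t v) {
//     vst1q_lane_s32(p, v, 3);           // may select: st1 {v0.s}[3], [x0]
//   }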
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; + +// Store single N-element structure from one lane of N consecutive registers +// (N = 2,3,4) +defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; +defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; +defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; + +multiclass ST1LN_patterns { + def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64), + ImmOp:$lane)>; + + def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>; +} + +// Match all ST1LN instructions +defm : ST1LN_patterns; + +defm : ST1LN_patterns; + +defm : ST1LN_patterns; +defm : ST1LN_patterns; + +defm : ST1LN_patterns; +defm : ST1LN_patterns; + +// End of vector load/store single N-element structure (class SIMD lsone). + + +// The following are post-index load/store single N-element instructions +// (class SIMD lsone-post) + +multiclass NeonI_LDN_WB_Dup opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + def _fixed : NeonI_LdOne_Dup_Post { + let Rm = 0b11111; + } + + def _register : NeonI_LdOne_Dup_Post; + } +} + +multiclass LDWB_Dup_BHSD opcode, string List, string asmop, + Operand uimm_b, Operand uimm_h, + Operand uimm_s, Operand uimm_d> { + defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00, + !cast(List # "8B_operand"), + uimm_b, asmop>; + + defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01, + !cast(List # "4H_operand"), + uimm_h, asmop>; + + defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10, + !cast(List # "2S_operand"), + uimm_s, asmop>; + + defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11, + !cast(List # "1D_operand"), + uimm_d, asmop>; + + defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00, + !cast(List # "16B_operand"), + uimm_b, asmop>; + + defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01, + !cast(List # "8H_operand"), + uimm_h, asmop>; + + defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10, + !cast(List # "4S_operand"), + uimm_s, asmop>; + + defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11, + !cast(List # "2D_operand"), + uimm_d, asmop>; +} + +// Post-index load single 1-element structure to all lanes of 1 register +defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, + Constraints = "$Rn = $wb, $Rt = $src", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class LDN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + class LDN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, + Operand 
ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $Rm", + [], + NoItinerary>; +} + +multiclass LD_Lane_WB_BHSD { + def _B_fixed : LDN_WBFx_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _B_register : LDN_WBReg_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H_fixed : LDN_WBFx_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _H_register : LDN_WBReg_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S_fixed : LDN_WBFx_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _S_register : LDN_WBReg_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D_fixed : LDN_WBFx_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } + + def _D_register : LDN_WBReg_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Post-index load single 1-element structure to one lane of 1 register. +defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to one lane of N consecutive +// registers +// (N = 2,3,4) +defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayStore = 1, neverHasSideEffects = 1, + hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class STN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } + + class STN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt, + ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $Rm", + [], + NoItinerary>; +} + +multiclass ST_Lane_WB_BHSD { + def _B_fixed : STN_WBFx_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _B_register : STN_WBReg_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + + def _H_fixed : STN_WBFx_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _H_register : STN_WBReg_Lane(List # "H_operand"), + uimm_h, 
neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } + + def _S_fixed : STN_WBFx_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _S_register : STN_WBReg_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } + + def _D_fixed : STN_WBFx_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } + + def _D_register : STN_WBReg_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} + +// Post-index store single 1-element structure from one lane of 1 register. +defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index store single N-element structure from one lane of N consecutive +// registers (N = 2,3,4) +defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +// End of post-index load/store single N-element instructions +// (class SIMD lsone-post) + +// Neon Scalar instructions implementation +// Scalar Three Same + +class NeonI_Scalar3Same_size size, bits<5> opcode, string asmop, + RegisterClass FPRC> + : NeonI_Scalar3Same; + +class NeonI_Scalar3Same_D_size opcode, string asmop> + : NeonI_Scalar3Same_size; + +multiclass NeonI_Scalar3Same_HS_sizes opcode, string asmop, + bit Commutable = 0> { + let isCommutable = Commutable in { + def hhh : NeonI_Scalar3Same_size; + def sss : NeonI_Scalar3Same_size; + } +} + +multiclass NeonI_Scalar3Same_SD_sizes opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def sss : NeonI_Scalar3Same_size; + def ddd : NeonI_Scalar3Same_size; + } +} + +multiclass NeonI_Scalar3Same_BHSD_sizes opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def bbb : NeonI_Scalar3Same_size; + def hhh : NeonI_Scalar3Same_size; + def sss : NeonI_Scalar3Same_size; + def ddd : NeonI_Scalar3Same_size; + } +} + +multiclass Neon_Scalar3Same_D_size_patterns { + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass Neon_Scalar3Same_BHSD_size_patterns + : Neon_Scalar3Same_D_size_patterns { + def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), + (INSTB FPR8:$Rn, FPR8:$Rm)>; + + def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + + def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; +} + +class Neon_Scalar3Same_cmp_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; + +multiclass Neon_Scalar3Same_HS_size_patterns { + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass Neon_Scalar3Same_SD_size_patterns { + def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 
FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass Neon_Scalar3Same_cmp_SD_size_patterns { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +class Neon_Scalar3Same_cmp_V1_D_size_patterns + : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)), + (INSTD FPR64:$Rn, FPR64:$Rm)>; + +// Scalar Three Different + +class NeonI_Scalar3Diff_size size, bits<4> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS> + : NeonI_Scalar3Diff; + +multiclass NeonI_Scalar3Diff_HS_size opcode, string asmop> { + def shh : NeonI_Scalar3Diff_size; + def dss : NeonI_Scalar3Diff_size; +} + +multiclass NeonI_Scalar3Diff_ml_HS_size opcode, string asmop> { + let Constraints = "$Src = $Rd" in { + def shh : NeonI_Scalar3Diff; + def dss : NeonI_Scalar3Diff; + } +} + +multiclass Neon_Scalar3Diff_HS_size_patterns { + def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass Neon_Scalar3Diff_ml_HS_size_patterns { + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>; +} + +// Scalar Two Registers Miscellaneous + +class NeonI_Scalar2SameMisc_size size, bits<5> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS> + : NeonI_Scalar2SameMisc; + +multiclass NeonI_Scalar2SameMisc_SD_size opcode, + string asmop> { + def ss : NeonI_Scalar2SameMisc_size; + def dd : NeonI_Scalar2SameMisc_size; +} + +multiclass NeonI_Scalar2SameMisc_D_size opcode, string asmop> { + def dd : NeonI_Scalar2SameMisc_size; +} + +multiclass NeonI_Scalar2SameMisc_BHSD_size opcode, string asmop> + : NeonI_Scalar2SameMisc_D_size { + def bb : NeonI_Scalar2SameMisc_size; + def hh : NeonI_Scalar2SameMisc_size; + def ss : NeonI_Scalar2SameMisc_size; +} + +class NeonI_Scalar2SameMisc_fcvtxn_D_size opcode, string asmop> + : NeonI_Scalar2SameMisc_size; + +multiclass NeonI_Scalar2SameMisc_narrow_HSD_size opcode, + string asmop> { + def bh : NeonI_Scalar2SameMisc_size; + def hs : NeonI_Scalar2SameMisc_size; + def sd : NeonI_Scalar2SameMisc_size; +} + +class NeonI_Scalar2SameMisc_accum_size size, bits<5> opcode, + string asmop, RegisterClass FPRC> + : NeonI_Scalar2SameMisc; + +multiclass NeonI_Scalar2SameMisc_accum_BHSD_size opcode, + string asmop> { + + let Constraints = "$Src = $Rd" in { + def bb : NeonI_Scalar2SameMisc_accum_size; + def hh : NeonI_Scalar2SameMisc_accum_size; + def ss : NeonI_Scalar2SameMisc_accum_size; + def dd : NeonI_Scalar2SameMisc_accum_size; + } +} + +class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns + : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; + +multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns { + def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_SD_size_patterns { + def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def 
: Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +class NeonI_Scalar2SameMisc_cmpz_D_size opcode, string asmop> + : NeonI_Scalar2SameMisc; + +multiclass NeonI_Scalar2SameMisc_cmpz_SD_size opcode, + string asmop> { + def ssi : NeonI_Scalar2SameMisc; + def ddi : NeonI_Scalar2SameMisc; +} + +class Neon_Scalar2SameMisc_cmpz_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), + (v1i64 (bitconvert (v8i8 Neon_AllZero))))), + (INSTD FPR64:$Rn, 0)>; + +class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns + : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn), + (i32 neon_uimm0:$Imm), CC)), + (INSTD FPR64:$Rn, neon_uimm0:$Imm)>; + +multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), + (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))), + (INSTS FPR32:$Rn, fpz32:$FPImm)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), + (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))), + (INSTD FPR64:$Rn, fpz32:$FPImm)>; +} + +multiclass Neon_Scalar2SameMisc_D_size_patterns { + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_BHSD_size_patterns + : Neon_Scalar2SameMisc_D_size_patterns { + def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))), + (INSTB FPR8:$Rn)>; + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Rn)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; +} + +multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Rn)>; + def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; + +} + +multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTB, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))), + (INSTB FPR8:$Src, FPR8:$Rn)>; + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Src, FPR16:$Rn)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Src, FPR32:$Rn)>; + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Src, FPR64:$Rn)>; +} + +// Scalar Shift By Immediate + +class NeonI_ScalarShiftImm_size opcode, string asmop, + RegisterClass FPRC, Operand ImmTy> + : NeonI_ScalarShiftImm; + +multiclass NeonI_ScalarShiftRightImm_D_size opcode, + string asmop> { + def ddi : NeonI_ScalarShiftImm_size { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftRightImm_BHSD_size opcode, + string asmop> + : NeonI_ScalarShiftRightImm_D_size { + def bbi : NeonI_ScalarShiftImm_size { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hhi : NeonI_ScalarShiftImm_size { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def ssi : NeonI_ScalarShiftImm_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftLeftImm_D_size opcode, + string asmop> { + def ddi : NeonI_ScalarShiftImm_size { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftLeftImm_BHSD_size opcode, + string asmop> + : NeonI_ScalarShiftLeftImm_D_size { + def bbi : 
NeonI_ScalarShiftImm_size { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hhi : NeonI_ScalarShiftImm_size { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def ssi : NeonI_ScalarShiftImm_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} + +class NeonI_ScalarShiftRightImm_accum_D_size opcode, string asmop> + : NeonI_ScalarShiftImm { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + let Constraints = "$Src = $Rd"; +} + +class NeonI_ScalarShiftLeftImm_accum_D_size opcode, string asmop> + : NeonI_ScalarShiftImm { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + let Constraints = "$Src = $Rd"; +} + +class NeonI_ScalarShiftImm_narrow_size opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS, + Operand ImmTy> + : NeonI_ScalarShiftImm; + +multiclass NeonI_ScalarShiftImm_narrow_HSD_size opcode, + string asmop> { + def bhi : NeonI_ScalarShiftImm_narrow_size { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hsi : NeonI_ScalarShiftImm_narrow_size { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def sdi : NeonI_ScalarShiftImm_narrow_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} + +multiclass NeonI_ScalarShiftImm_cvt_SD_size opcode, string asmop> { + def ssi : NeonI_ScalarShiftImm_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } + def ddi : NeonI_ScalarShiftImm_size { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} + +multiclass Neon_ScalarShiftRImm_D_size_patterns { + def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftLImm_D_size_patterns { + def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +class Neon_ScalarShiftImm_arm_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), + (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))), + (INSTD FPR64:$Rn, imm:$Imm)>; + +multiclass Neon_ScalarShiftLImm_BHSD_size_patterns + : Neon_ScalarShiftLImm_D_size_patterns { + def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))), + (INSTB FPR8:$Rn, imm:$Imm)>; + def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))), + (INSTH FPR16:$Rn, imm:$Imm)>; + def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; +} + +class Neon_ScalarShiftLImm_accum_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), + (i32 shl_imm64:$Imm))), + (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; + +class Neon_ScalarShiftRImm_accum_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), + (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; + +multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))), + (INSTH FPR16:$Rn, imm:$Imm)>; + def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, 
imm:$Imm)>; +} + +multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns { + def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns { + def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +// Scalar Signed Shift Right (Immediate) +defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">; +defm : Neon_ScalarShiftRImm_D_size_patterns; +// Pattern to match llvm.arm.* intrinsic. +def : Neon_ScalarShiftImm_arm_D_size_patterns; + +// Scalar Unsigned Shift Right (Immediate) +defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">; +defm : Neon_ScalarShiftRImm_D_size_patterns; +// Pattern to match llvm.arm.* intrinsic. +def : Neon_ScalarShiftImm_arm_D_size_patterns; + +// Scalar Signed Rounding Shift Right (Immediate) +defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">; +defm : Neon_ScalarShiftRImm_D_size_patterns; + +// Scalar Unsigned Rounding Shift Right (Immediate) +defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">; +defm : Neon_ScalarShiftRImm_D_size_patterns; + +// Scalar Signed Shift Right and Accumulate (Immediate) +def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + ; + +// Scalar Unsigned Shift Right and Accumulate (Immediate) +def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + ; + +// Scalar Signed Rounding Shift Right and Accumulate (Immediate) +def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + ; + +// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate) +def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + ; + +// Scalar Shift Left (Immediate) +defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">; +defm : Neon_ScalarShiftLImm_D_size_patterns; +// Pattern to match llvm.arm.* intrinsic. +def : Neon_ScalarShiftImm_arm_D_size_patterns; + +// Signed Saturating Shift Left (Immediate) +defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns; +// Pattern to match llvm.arm.* intrinsic. +defm : Neon_ScalarShiftLImm_D_size_patterns; + +// Unsigned Saturating Shift Left (Immediate) +defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns; +// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns; + +// Signed Saturating Shift Left Unsigned (Immediate) +defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns; + +// Shift Right And Insert (Immediate) +def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + ; + +// Shift Left And Insert (Immediate) +def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">; +def : Neon_ScalarShiftLImm_accum_D_size_patterns + ; + +// Signed Saturating Shift Right Narrow (Immediate) +defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Unsigned Saturating Shift Right Narrow (Immediate) +defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Signed Saturating Rounded Shift Right Narrow (Immediate) +defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Unsigned Saturating Rounded Shift Right Narrow (Immediate) +defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Signed Saturating Shift Right Unsigned Narrow (Immediate) +defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) +defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Scalar Signed Fixed-point Convert To Floating-Point (Immediate) +defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">; +defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; + +// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) +defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">; +defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; + +// Scalar Floating-point Convert To Signed Fixed-point (Immediate) +defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">; +defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; + +// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) +defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">; +defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; + +// Patterns For Convert Instructions Between v1f64 and v1i64 +class Neon_ScalarShiftImm_cvtf_v1f64_pattern + : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INST FPR64:$Rn, imm:$Imm)>; + +class Neon_ScalarShiftImm_fcvt_v1f64_pattern + : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INST FPR64:$Rn, imm:$Imm)>; + +def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; + +def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; + +def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; + +def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; + +// Scalar Integer Add +let isCommutable = 1 in { +def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; +} + +// Scalar Integer Sub +def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; + +// Pattern for Scalar Integer Add and Sub with D register only +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub +defm : Neon_Scalar3Same_D_size_patterns; +defm : 
Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Scalar Integer Saturating Add (Signed, Unsigned) +defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; +defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; + +// Scalar Integer Saturating Sub (Signed, Unsigned) +defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; +defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; + + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Add, Sub (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; + +// Scalar Integer Saturating Doubling Multiply Half High +defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>; + +// Scalar Integer Saturating Rounding Doubling Multiply Half High +defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Doubling Multiply Half High and +// Scalar Integer Saturating Rounding Doubling Multiply Half High +defm : Neon_Scalar3Same_HS_size_patterns; +defm : Neon_Scalar3Same_HS_size_patterns; + +// Scalar Floating-point Multiply Extended +defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>; + +// Scalar Floating-point Reciprocal Step +defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>; + +// Scalar Floating-point Reciprocal Square Root Step +defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Floating-point Reciprocal Step and +// Scalar Floating-point Reciprocal Square Root Step +defm : Neon_Scalar3Same_SD_size_patterns; +defm : Neon_Scalar3Same_SD_size_patterns; + +def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Floating-point Multiply Extended, +multiclass Neon_Scalar3Same_MULX_SD_size_patterns { + def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} + +defm : Neon_Scalar3Same_MULX_SD_size_patterns; + +// Scalar Integer Shift Left (Signed, Unsigned) +def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">; +def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Scalar Integer Saturating Shift Left (Signed, Unsigned) +defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>; +defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +//
Scalar Integer Rounding Shift Left (Signed, Unsigned) +def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">; +def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>; +defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Signed Saturating Doubling Multiply-Add Long +defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">; +defm : Neon_Scalar3Diff_ml_HS_size_patterns; + +// Signed Saturating Doubling Multiply-Subtract Long +defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">; +defm : Neon_Scalar3Diff_ml_HS_size_patterns; + +// Signed Saturating Doubling Multiply Long +defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">; +defm : Neon_Scalar3Diff_HS_size_patterns; + +// Scalar Signed Integer Convert To Floating-point +defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">; +defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; + +// Scalar Unsigned Integer Convert To Floating-point +defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">; +defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; + +// Scalar Floating-point Converts +def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">; +def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns; + +defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +// Patterns For Convert Instructions Between 
v1f64 and v1i64 +class Neon_Scalar2SameMisc_cvtf_v1f64_pattern + : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +class Neon_Scalar2SameMisc_fcvt_v1f64_pattern + : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern; +def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern; + +def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern; +def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern; + +// Scalar Floating-point Reciprocal Estimate +defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">; +defm : Neon_Scalar2SameMisc_SD_size_patterns; + +// Scalar Floating-point Reciprocal Exponent +defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">; +defm : Neon_Scalar2SameMisc_SD_size_patterns; + +// Scalar Floating-point Reciprocal Square Root Estimate +defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">; +defm : Neon_Scalar2SameMisc_SD_size_patterns; + +// Scalar Floating-point Round +class Neon_ScalarFloatRound_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +def : Neon_ScalarFloatRound_pattern; +def : Neon_ScalarFloatRound_pattern; +def : Neon_ScalarFloatRound_pattern; +def : Neon_ScalarFloatRound_pattern; +def : Neon_ScalarFloatRound_pattern; +def : Neon_ScalarFloatRound_pattern; +def : Neon_ScalarFloatRound_pattern; + +// Scalar Integer Compare + +// Scalar Compare Bitwise Equal +def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">; +def : Neon_Scalar3Same_cmp_D_size_patterns; + +class Neon_Scalar3Same_cmp_D_size_v1_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)), + (INSTD FPR64:$Rn, FPR64:$Rm)>; + +def : Neon_Scalar3Same_cmp_D_size_v1_patterns; + +// Scalar Compare Signed Greater Than Or Equal +def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">; +def : Neon_Scalar3Same_cmp_D_size_patterns; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns; + +// Scalar Compare Unsigned Higher Or Same +def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">; +def : Neon_Scalar3Same_cmp_D_size_patterns; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns; + +// Scalar Compare Unsigned Higher +def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">; +def : Neon_Scalar3Same_cmp_D_size_patterns; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns; + +// Scalar Compare Signed Greater Than +def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">; +def : Neon_Scalar3Same_cmp_D_size_patterns; +def : Neon_Scalar3Same_cmp_D_size_v1_patterns; + +// Scalar Compare Bitwise Test Bits +def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">; +def : Neon_Scalar3Same_cmp_D_size_patterns; +def : Neon_Scalar3Same_cmp_D_size_patterns; + +// Scalar Compare Bitwise Equal To Zero +def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">; +def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; +def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; + +// Scalar Compare Signed Greater Than Or Equal To Zero +def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">; +def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; +def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; + +// Scalar Compare Signed Greater Than Zero +def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">; +def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; +def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; + +// Scalar Compare Signed Less Than Or Equal To Zero +def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">; +def :
Neon_Scalar2SameMisc_cmpz_D_size_patterns; +def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; + +// Scalar Compare Less Than Zero +def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">; +def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; +def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; + +// Scalar Floating-point Compare + +// Scalar Floating-point Compare Mask Equal +defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns; +def : Neon_Scalar3Same_cmp_V1_D_size_patterns; + +// Scalar Floating-point Compare Mask Equal To Zero +defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">; +defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; +def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpz32:$FPImm), SETEQ)), + (FCMEQZddi FPR64:$Rn, fpz32:$FPImm)>; + +// Scalar Floating-point Compare Mask Greater Than Or Equal +defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns; +def : Neon_Scalar3Same_cmp_V1_D_size_patterns; + +// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero +defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">; +defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; + +// Scalar Floating-point Compare Mask Greater Than +defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns; +def : Neon_Scalar3Same_cmp_V1_D_size_patterns; + +// Scalar Floating-point Compare Mask Greater Than Zero +defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">; +defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; + +// Scalar Floating-point Compare Mask Less Than Or Equal To Zero +defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">; +defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; + +// Scalar Floating-point Compare Mask Less Than Zero +defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">; +defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; + +// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal +defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns; + +// Scalar Floating-point Absolute Compare Mask Greater Than +defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">; +defm : Neon_Scalar3Same_cmp_SD_size_patterns; + +// Scalar Floating-point Absolute Difference +defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">; +defm : Neon_Scalar3Same_SD_size_patterns; + +// Scalar Absolute Value +defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">; +defm : Neon_Scalar2SameMisc_D_size_patterns; + +// Scalar Signed Saturating Absolute Value +defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">; +defm : Neon_Scalar2SameMisc_BHSD_size_patterns; + +// Scalar Negate +defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">; +defm : Neon_Scalar2SameMisc_D_size_patterns; + +// Scalar Signed Saturating Negate +defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">; +defm : Neon_Scalar2SameMisc_BHSD_size_patterns; + +// Scalar Signed Saturating Accumulated of Unsigned Value +defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">; +defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns; + +// Scalar Unsigned Saturating Accumulated of Signed Value +defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">; +defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns; +
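+// The v1i64 patterns below map the saturating-accumulate, absolute-value,
+// and negate intrinsics onto the D-register scalar instructions defined above.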
+def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src), + (v1i64 FPR64:$Rn))), + (SUQADDdd FPR64:$Src, FPR64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src), + (v1i64 FPR64:$Rn))), + (USQADDdd FPR64:$Src, FPR64:$Rn)>; + +def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))), + (ABSdd FPR64:$Rn)>; + +def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))), + (SQABSdd FPR64:$Rn)>; + +def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))), + (SQNEGdd FPR64:$Rn)>; + +def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))), + (v1i64 FPR64:$Rn))), + (NEGdd FPR64:$Rn)>; + +// Scalar Signed Saturating Extract Unsigned Narrow +defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">; +defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; + +// Scalar Signed Saturating Extract Narrow +defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">; +defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; + +// Scalar Unsigned Saturating Extract Narrow +defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">; +defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; + +// Scalar Reduce Pairwise + +multiclass NeonI_ScalarPair_D_sizes opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def _D_2D : NeonI_ScalarPair; + } +} + +multiclass NeonI_ScalarPair_SD_sizes opcode, + string asmop, bit Commutable = 0> + : NeonI_ScalarPair_D_sizes { + let isCommutable = Commutable in { + def _S_2S : NeonI_ScalarPair; + } +} + +// Scalar Reduce Addition Pairwise (Integer) with +// Pattern to match llvm.arm.* intrinsic +defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>; + +// Pattern to match llvm.aarch64.* intrinsic for +// Scalar Reduce Addition Pairwise (Integer) +def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))), + (ADDPvv_D_2D VPR128:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))), + (ADDPvv_D_2D VPR128:$Rn)>; + +// Scalar Reduce Addition Pairwise (Floating Point) +defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>; + +// Scalar Reduce Maximum Pairwise (Floating Point) +defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>; + +// Scalar Reduce Minimum Pairwise (Floating Point) +defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>; + +// Scalar Reduce maxNum Pairwise (Floating Point) +defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>; + +// Scalar Reduce minNum Pairwise (Floating Point) +defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>; + +multiclass Neon_ScalarPair_SD_size_patterns { + def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))), + (INSTS VPR64:$Rn)>; + def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))), + (INSTD VPR128:$Rn)>; +} + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Reduce Add, Max, Min, MaxNum, MinNum Pairwise (Floating Point) +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))), + (FADDPvv_S_2S (v2f32 + (EXTRACT_SUBREG + (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))), + sub_64)))>; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm : Neon_ScalarPair_SD_size_patterns; + +defm :
Neon_ScalarPair_SD_size_patterns; + +// Scalar by element Arithmetic + +class NeonI_ScalarXIndexedElemArith opcode, + string rmlane, bit u, bit szhi, bit szlo, + RegisterClass ResFPR, RegisterClass OpFPR, + RegisterOperand OpVPR, Operand OpImm> + : NeonI_ScalarXIndexedElem { + bits<3> Imm; + bits<5> MRm; +} + +class NeonI_ScalarXIndexedElemArith_Constraint_Impl opcode, + string rmlane, + bit u, bit szhi, bit szlo, + RegisterClass ResFPR, + RegisterClass OpFPR, + RegisterOperand OpVPR, + Operand OpImm> + : NeonI_ScalarXIndexedElem { + let Constraints = "$src = $Rd"; + bits<3> Imm; + bits<5> MRm; +} + +// Scalar Floating Point multiply (scalar, by element) +def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul", + 0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul", + 0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +// Scalar Floating Point multiply extended (scalar, by element) +def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx", + 0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx", + 0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns< + SDPatternOperator opnode, + Instruction INST, + ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + + def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))), + (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))), + (ResTy (INST (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped operands + def : Pat<(ResTy (opnode + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Rn))), + (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (opnode + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Rn))), + (ResTy (INST (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for Scalar Floating Point multiply (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; + +// Patterns for Scalar Floating Point multiply extended (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; + + +// Scalar Floating Point fused multiply-add (scalar, by element) +def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", + 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", + 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +// Scalar Floating Point fused 
multiply-subtract (scalar, by element) +def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", + 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", + 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} +// We are allowed to match the fma instruction regardless of compile options. +multiclass Neon_ScalarXIndexedElem_FMA_Patterns< + Instruction FMLAI, Instruction FMLSI, + ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + // fmla + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped fmla operands + def : Pat<(ResTy (fma + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // fmls + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped fmls operands + def : Pat<(ResTy (fma + (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma + (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; +} + +// Scalar Floating Point fused multiply-add and +// multiply-subtract (scalar, by element) +defm : Neon_ScalarXIndexedElem_FMA_Patterns; +defm : Neon_ScalarXIndexedElem_FMA_Patterns; +defm : Neon_ScalarXIndexedElem_FMA_Patterns; + +// Scalar Signed saturating doubling multiply long (scalar, by element) +def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m 
+ let Inst{19-16} = MRm{3-0}; +} +def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MUL_Patterns< + SDPatternOperator opnode, + Instruction INST, + ValueType ResTy, RegisterClass FPRC, + ValueType OpVTy, ValueType OpTy, + ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { + + def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), + (OpVTy (scalar_to_vector + (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))), + (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; + + //swapped operands + def : Pat<(ResTy (opnode + (OpVTy (scalar_to_vector + (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))), + (OpVTy FPRC:$Rn))), + (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; +} + + +// Patterns for Scalar Signed saturating doubling +// multiply long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; + +// Scalar Signed saturating doubling multiply-add long (scalar, by element) +def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +// Scalar Signed saturating doubling +// multiply-subtract long (scalar, by element) +def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { 
+ let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MLAL_Patterns< + SDPatternOperator opnode, + SDPatternOperator coreopnode, + Instruction INST, + ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC, + ValueType OpTy, + ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { + + def : Pat<(ResTy (opnode + (ResTy ResFPRC:$Ra), + (ResTy (coreopnode (OpTy FPRC:$Rn), + (OpTy (scalar_to_vector + (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))), + (ResTy (INST (ResTy ResFPRC:$Ra), + (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; + + // swapped operands + def : Pat<(ResTy (opnode + (ResTy ResFPRC:$Ra), + (ResTy (coreopnode + (OpTy (scalar_to_vector + (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))), + (OpTy FPRC:$Rn))))), + (ResTy (INST (ResTy ResFPRC:$Ra), + (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; +} + +// Patterns for Scalar Signed saturating +// doubling multiply-add long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; + +// Patterns for Scalar Signed saturating +// doubling multiply-sub long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; + +// Scalar general arithmetic operation +class Neon_Scalar_GeneralMath2D_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +class Neon_Scalar_GeneralMath3D_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INST FPR64:$Rn, FPR64:$Rm)>; + +class Neon_Scalar_GeneralMath4D_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), + (v1f64 FPR64:$Ra))), + (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; + +def : Neon_Scalar_GeneralMath2D_pattern; +def : Neon_Scalar_GeneralMath2D_pattern; + +def : Neon_Scalar_GeneralMath4D_pattern; +def : Neon_Scalar_GeneralMath4D_pattern; + +// Scalar Signed saturating doubling multiply returning +// high half (scalar, by element) +def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = 
MRm; +} + +// Patterns for Scalar Signed saturating doubling multiply returning +// high half (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; + +// Scalar Signed saturating rounding doubling multiply +// returning high half (scalar, by element) +def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; + +// Scalar Copy - DUP element to scalar +class NeonI_Scalar_DUP + : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]", + [], + NoItinerary> { + bits<4> Imm; +} + +def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +multiclass NeonI_Scalar_DUP_Elt_pattern { + def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for vector extract of FP data using scalar DUP instructions +defm : NeonI_Scalar_DUP_Elt_pattern; +defm : NeonI_Scalar_DUP_Elt_pattern; + +multiclass NeonI_Scalar_DUP_Ext_Vec_pattern { + + def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>; + + def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for extract subvectors of v1ix data using scalar DUP instructions. 
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern; +defm : NeonI_Scalar_DUP_Ext_Vec_pattern; +defm : NeonI_Scalar_DUP_Ext_Vec_pattern; + +multiclass NeonI_Scalar_DUP_Copy_pattern1 { + + def : Pat<(ResTy (vector_insert (ResTy undef), + (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), + (neon_uimm0_bare:$Imm))), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (vector_insert (ResTy undef), + (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), + (OpNImm:$Imm))), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +multiclass NeonI_Scalar_DUP_Copy_pattern2 { + + def : Pat<(ResTy (scalar_to_vector + (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (scalar_to_vector + (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP +// instructions. +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; + +multiclass NeonI_Scalar_DUP_alias { + def : NeonInstAlias; +} + +// Aliases for Scalar copy - DUP element (scalar) +// FIXME: This is actually the preferred syntax but TableGen can't deal with +// custom printing of aliases. +defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>; +defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>; +defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>; +defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>; + +multiclass NeonI_SDUP { + def : Pat<(ResTy (GetLow VPR128:$Rn)), + (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>; + def : Pat<(ResTy (GetHigh VPR128:$Rn)), + (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>; +} + +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// 64-bit vector bitcasts... 
+ +def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; + +// ..and 128-bit vector bitcasts... + +def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 
(bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; + +// ...and scalar bitcasts... +def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>; +def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>; +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f32 (bitconvert (v1f32 FPR32:$src))), (f32 FPR32:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>; + +def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>; + +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; + +def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>; + +def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>; +def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f32 (bitconvert (f32 FPR32:$src))), (v1f32 FPR32:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; + +def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>; + +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 
(bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; + +// Scalar Three Same + +def neon_uimm3 : Operand, + ImmLeaf { + let ParserMatchClass = uimm3_asmoperand; + let PrintMethod = "printUImmHexOperand"; +} + +def neon_uimm4 : Operand, + ImmLeaf { + let ParserMatchClass = uimm4_asmoperand; + let PrintMethod = "printUImmHexOperand"; +} + +// Bitwise Extract +class NeonI_Extract op2, string asmop, + string OpS, RegisterOperand OpVPR, Operand OpImm> + : NeonI_BitExtract{ + bits<4> Index; +} + +def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b", + VPR64, neon_uimm3> { + let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}}; +} + +def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b", + VPR128, neon_uimm4> { + let Inst{14-11} = Index; +} + +class NI_Extract + : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm), + (i64 OpImm:$Imm))), + (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>; + +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; +def : NI_Extract; + +// Table lookup +class NI_TBL op2, bits<2> len, bit op, + string asmop, string OpS, RegisterOperand OpVPR, + RegisterOperand VecList> + : NeonI_TBL; + +// The vectors in look up table are always 16b +multiclass NI_TBL_pat len, bit op, string asmop, string List> { + def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64, + !cast(List # "16B_operand")>; + + def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128, + !cast(List # "16B_operand")>; +} + +defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">; +defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">; +defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">; +defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">; + +// Table lookup extention +class NI_TBX op2, bits<2> len, bit op, + string asmop, string OpS, RegisterOperand OpVPR, + RegisterOperand VecList> + : NeonI_TBL { + let Constraints = "$src = $Rd"; +} + +// The vectors in look up table are always 16b +multiclass NI_TBX_pat len, bit op, string asmop, string List> { + def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64, + !cast(List # "16B_operand")>; + + def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128, + !cast(List # "16B_operand")>; +} + +defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">; +defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">; +defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">; +defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">; + +class NeonI_INS_main + : NeonI_copy<0b1, 0b0, 0b0011, + (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), + asmop # "\t$Rd." 
# Res # "[$Imm], $Rn", + [(set (ResTy VPR128:$Rd), + (ResTy (vector_insert + (ResTy VPR128:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))))], + NoItinerary> { + bits<4> Imm; + let Constraints = "$src = $Rd"; +} + +//Insert element (vector, from main) +def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", + (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", + (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", + (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", + (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_INS_main_pattern + : Pat<(ResTy (vector_insert + (ResTy VPR64:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))), + (ResTy (EXTRACT_SUBREG + (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + OpGPR:$Rn, OpImm:$Imm)), sub_64))>; + +def INSbw_pattern : Neon_INS_main_pattern; +def INShw_pattern : Neon_INS_main_pattern; +def INSsw_pattern : Neon_INS_main_pattern; +def INSdx_pattern : Neon_INS_main_pattern; + +class NeonI_INS_element + : NeonI_insert<0b1, 0b1, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, + ResImm:$Immd, ResImm:$Immn), + asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", + [], + NoItinerary> { + let Constraints = "$src = $Rd"; + bits<4> Immd; + bits<4> Immn; +} + +//Insert element (vector, from element) +def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { + let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; + let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; +} +def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { + let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; + let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; + // bit 11 is unspecified, but should be set to zero. +} +def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { + let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; + let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; + // bits 11-12 are unspecified, but should be set to zero. +} +def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { + let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; + let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; + // bits 11-13 are unspecified, but should be set to zero. 
+} + +def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", + (INSELb VPR128:$Rd, VPR128:$Rn, + neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", + (INSELh VPR128:$Rd, VPR128:$Rn, + neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", + (INSELs VPR128:$Rd, VPR128:$Rn, + neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", + (INSELd VPR128:$Rd, VPR128:$Rn, + neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; + +multiclass Neon_INS_elt_pattern { +def : Pat<(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), + StImm:$Immd, StImm:$Immn)>; + +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + StImm:$Immd, NaImm:$Immn)>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy VPR128:$Rn), + NaImm:$Immd, StImm:$Immn)), + sub_64))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + NaImm:$Immd, NaImm:$Immn)), + sub_64))>; +} + +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; + +multiclass Neon_INS_elt_float_pattern { +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), + ResImm:$Imm, + (i64 0))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), + ResImm:$Imm, + (i64 0))), + sub_64))>; +} + +defm : Neon_INS_elt_float_pattern; +defm : Neon_INS_elt_float_pattern; + +class NeonI_SMOV + : NeonI_copy { + bits<4> Imm; +} + +//Signed integer move (main, from element) +def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass Neon_SMOVx_pattern { + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + eleTy)), + (SMOVI 
VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + (SMOVI VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; +} + +defm : Neon_SMOVx_pattern; +defm : Neon_SMOVx_pattern; +defm : Neon_SMOVx_pattern; + +class Neon_SMOVw_pattern + : Pat<(i32 (sext_inreg + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_SMOVw_pattern; +def : Neon_SMOVw_pattern; + +class NeonI_UMOV + : NeonI_copy { + bits<4> Imm; +} + +//Unsigned integer move (main, from element) +def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, + GPR64, i64> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", + (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", + (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_UMOV_pattern + : Pat<(ResTy (vector_extract + (NaTy VPR64:$Rn), NaImm:$Imm)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_UMOV_pattern; +def : Neon_UMOV_pattern; +def : Neon_UMOV_pattern; + +def : Pat<(i32 (and + (i32 (vector_extract + (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), + 255)), + (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), + 65535)), + (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), + (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), + 255)), + (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm3_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), + 65535)), + (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm2_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), + (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm0_bare:$Imm)>; + +// Additional copy patterns for scalar types +def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), + (UMOVwb (v16i8 + (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), + (UMOVwh (v8i16 + (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))), + (FMOVws FPR32:$Rn)>; + +def : 
Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), + (FMOVxd FPR64:$Rn)>; + +def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), + (f64 FPR64:$Rn)>; + +def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))), + (f32 FPR32:$Rn)>; + +def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), + (v1i8 (EXTRACT_SUBREG (v16i8 + (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_8))>; + +def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), + (v1i16 (EXTRACT_SUBREG (v8i16 + (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_16))>; + +def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), + (FMOVsw $src)>; + +def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), + (FMOVdx $src)>; + +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))), + (v1f32 FPR32:$Rn)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), + (v1f64 FPR64:$Rn)>; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; + +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), + (f64 FPR64:$src), sub_64)>; + +class NeonI_DUP_Elt + : NeonI_copy { + bits<4> Imm; +} + +def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass NeonI_DUP_Elt_pattern { +def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; + +def : Pat<(ResTy (Neon_vduplane + (NaTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPELT + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; +} +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; + +def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))), + (v2f32 (DUPELT2s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), + (v4f32 (DUPELT4s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), + (v2f64 (DUPELT2d + (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), + (i64 0)))>; + +class NeonI_DUP + : NeonI_copy; + +def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. 
+} + +def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. +} + +def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { + let Inst{20-16} = 0b01000; + // bit 20 is unspecified, but should be set to zero. +} + +def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. +} + +def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. +} + +// patterns for CONCAT_VECTORS +multiclass Concat_Vector_Pattern { +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), + (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), + (INSELd + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), + (i64 1), + (i64 0))>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), + (DUPELT2d + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (i64 0))> ; +} + +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; + +//patterns for EXTRACT_SUBVECTOR +def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), + (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), + (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), + (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), + (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), + (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), + (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; + +// The followings are for instruction class (3V Elem) + +// Variant 1 + +class NI_2VE size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; + + let Constraints = "$src = $Rd"; +} + +multiclass NI_2VE_v1 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
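// (Rationale: with an H-sized element the lane index needs three bits, and
//  Index{0} takes over Inst{20}, leaving only Inst{19-16} for the element
//  register -- hence the VPR128Lo operand class that limits $Re to V0-V15
//  in the defs below.)
// Illustrative C-level sketch of the by-element multiply-accumulate these
// records implement (mla/mls are instantiated just below); it assumes the
// ACLE <arm_neon.h> intrinsics and a hypothetical helper name, and is not
// taken from this patch:
//
//   #include <arm_neon.h>
//   /* e.g. mla vD.8h, vN.8h, vM.h[5] -- vM restricted to V0-V15 */
//   int16x8_t mla_by_lane(int16x8_t acc, int16x8_t a, int16x8_t b) {
//     return vmlaq_laneq_s16(acc, a, b, 5);
//   }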
+ def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; +defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; + +// Pattern for lane in 128-bit vector +class NI_2VE_laneq + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_lane + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_v1_pat +{ + def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; +defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; + +class NI_2VE_2op size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; +} + +multiclass NI_2VE_v1_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
+ def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; +defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; +defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; + +// Pattern for lane in 128-bit vector +class NI_2VE_mul_laneq + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_mul_lane + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_mul_v1_pat { + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; +defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; +defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; + +// Variant 2 + +multiclass NI_2VE_v2_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
+ + def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; +defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; + +class NI_2VE_mul_lane_2d + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; + +multiclass NI_2VE_mul_v2_pat { + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2f32, v2f32, v4f32>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4f32, v4f32, v4f32>; + + def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR128, v2f64, v2f64, v2f64>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2f32, v2f32, v2f32>; + + def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR64, v2f64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; +defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; + +def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), + (v2f32 VPR64:$Rn))), + (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), + (v4f32 VPR128:$Rn))), + (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), + (v2f64 VPR128:$Rn))), + (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; + +// The followings are patterns using fma +// -ffp-contract=fast generates fma + +multiclass NI_2VE_v2 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
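// Illustrative C-level sketch of the by-element fused multiply-add this
// multiclass provides (fmla/fmls are instantiated just below); it assumes the
// ACLE <arm_neon.h> intrinsics and a hypothetical helper name, and is not
// taken from this patch. As noted above, a plain a*b+c expression only reaches
// these patterns when the front end contracts it (e.g. -ffp-contract=fast),
// whereas the vfma* intrinsics request a fused operation directly:
//
//   #include <arm_neon.h>
//   /* e.g. fmla vD.4s, vN.4s, vM.s[3] */
//   float32x4_t fmla_by_lane(float32x4_t acc, float32x4_t a, float32x4_t v) {
//     return vfmaq_laneq_f32(acc, a, v, 3);
//   }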
+ + def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; +defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; + +// Pattern for lane in 128-bit vector +class NI_2VEswap_laneq + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), + (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane 0 +class NI_2VEfma_lane0 + : Pat<(ResTy (op (ResTy ResVPR:$Rn), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane_2d2d + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; + + +multiclass NI_2VE_fma_v2_pat { + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; + +// Pattern for lane 0 +class NI_2VEfms_lane0 + : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +multiclass NI_2VE_fms_v2_pat +{ + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, 
node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(fneg (Neon_combine_2d + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d + (fneg node:$LHS), (fneg node:$RHS))>>; +} + +defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; + +// Variant 3: Long type +// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S +// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S + +multiclass NI_2VE_v3 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; +defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; +defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; +defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; +defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; +defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; + +multiclass NI_2VE_v3_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
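// Illustrative C-level sketch of the widening multiply by element handled by
// this multiclass (smull/umull/sqdmull are instantiated just below); it
// assumes the ACLE <arm_neon.h> intrinsics and hypothetical helper names, and
// is not taken from this patch. The asmop # "2" records are the high-half
// forms:
//
//   #include <arm_neon.h>
//   /* smull  vD.4s, vN.4h, vM.h[2] -- low half */
//   int32x4_t mull_lo(int16x4_t a, int16x8_t v) { return vmull_laneq_s16(a, v, 2); }
//   /* smull2 vD.4s, vN.8h, vM.h[2] -- high half */
//   int32x4_t mull_hi(int16x8_t a, int16x8_t v) { return vmull_high_laneq_s16(a, v, 2); }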
+ def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; +defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; +defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), + (FMOVss $src)>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_laneq + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_lane + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +class NI_2VEL2_lane0 + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_v3_pat { + def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, + op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, + op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; + + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, + op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, + op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; + + def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; +defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; +defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; +defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_mul_laneq + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_mul_lane + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, 
sub_64), OpImm:$Index)>; + +// Pattern for fixed lane 0 +class NI_2VEL2_mul_lane0 + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_mul_v3_pat { + def : NI_2VE_mul_laneq(subop # "_4s4h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + + def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i64, v2i32, v4i32>; + + def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_mul_lane0(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_mul_lane0(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + + def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i64, v2i32, v2i32>; + + def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; +defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; +defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; + +multiclass NI_qdma { + def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; + + def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; +} + +defm Neon_qdmlal : NI_qdma; +defm Neon_qdmlsl : NI_qdma; + +multiclass NI_2VEL_v3_qdma_pat { + def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, + !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, + v4i32, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, + !cast(op # "_2d"), VPR128, VPR64, VPR128, + v2i64, v2i32, v4i32>; + + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, + !cast(op # "_4s"), VPR128Lo, + v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, + !cast(op # "_2d"), VPR128, + v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0(subop # "_4s8h"), + !cast(op # "_4s"), + v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0(subop # "_2d4s"), + !cast(op # "_2d"), + v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, + !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, + v4i32, v4i16, v4i16>; + + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, + !cast(op # "_2d"), VPR128, VPR64, VPR64, + v2i64, v2i32, v2i32>; + + def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, + !cast(op # "_4s"), VPR64Lo, + v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, + !cast(op # "_2d"), VPR64, + v2i64, v4i32, v2i32, v2i32, Neon_High4S>; +} + +defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; +defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; + +// End of implementation for instruction 
class (3V Elem) + +class NeonI_REV size, bit Q, bit U, + bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, + SDPatternOperator Neon_Rev> + : NeonI_2VMisc ; + +def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128, + v16i8, Neon_rev64>; +def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128, + v8i16, Neon_rev64>; +def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128, + v4i32, Neon_rev64>; +def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64, + v8i8, Neon_rev64>; +def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64, + v4i16, Neon_rev64>; +def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64, + v2i32, Neon_rev64>; + +def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>; +def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>; + +def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128, + v16i8, Neon_rev32>; +def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128, + v8i16, Neon_rev32>; +def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64, + v8i8, Neon_rev32>; +def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64, + v4i16, Neon_rev32>; + +def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128, + v16i8, Neon_rev16>; +def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64, + v8i8, Neon_rev16>; + +multiclass NeonI_PairwiseAdd opcode, + SDPatternOperator Neon_Padd> { + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.16b", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.8b", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.8h", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.4h", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.4s", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.1d, $Rn.2s", + [(set (v1i64 VPR64:$Rd), + (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010, + int_arm_neon_vpaddls>; +defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010, + int_arm_neon_vpaddlu>; + +multiclass NeonI_PairwiseAddAcc opcode, + SDPatternOperator Neon_Padd> { + let Constraints = "$src = $Rd" in { + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.16b", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Padd + (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.8b", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Padd + (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 8h4s : 
NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.8h", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Padd + (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.4h", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Padd + (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.4s", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_Padd + (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.1d, $Rn.2s", + [(set (v1i64 VPR64:$Rd), + (v1i64 (Neon_Padd + (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))], + NoItinerary>; + } +} + +defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110, + int_arm_neon_vpadals>; +defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110, + int_arm_neon_vpadalu>; + +multiclass NeonI_2VMisc_BHSDsize_1Arg opcode> { + def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [], NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [], NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [], NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [], NoItinerary>; +} + +defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>; +defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>; +defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>; +defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>; + +multiclass NeonI_2VMisc_BHSD_1Arg_Pattern { + def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))), + (v16i8 (!cast(Prefix # 16b) (v16i8 VPR128:$Rn)))>; + + def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))), + (v8i16 (!cast(Prefix # 8h) (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))), + (v4i32 (!cast(Prefix # 4s) (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))), + (v2i64 (!cast(Prefix # 2d) (v2i64 VPR128:$Rn)))>; + + def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))), + (v8i8 (!cast(Prefix # 8b) (v8i8 VPR64:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))), + (v4i16 (!cast(Prefix # 4h) (v4i16 VPR64:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))), + (v2i32 (!cast(Prefix # 2s) (v2i32 VPR64:$Rn)))>; +} + +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>; +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>; +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>; + +def : Pat<(v16i8 (sub + (v16i8 Neon_AllZero), + (v16i8 VPR128:$Rn))), + (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (sub + (v8i8 Neon_AllZero), + (v8i8 VPR64:$Rn))), + (v8i8 (NEG8b 
(v8i8 VPR64:$Rn)))>; +def : Pat<(v8i16 (sub + (v8i16 (bitconvert (v16i8 Neon_AllZero))), + (v8i16 VPR128:$Rn))), + (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>; +def : Pat<(v4i16 (sub + (v4i16 (bitconvert (v8i8 Neon_AllZero))), + (v4i16 VPR64:$Rn))), + (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>; +def : Pat<(v4i32 (sub + (v4i32 (bitconvert (v16i8 Neon_AllZero))), + (v4i32 VPR128:$Rn))), + (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>; +def : Pat<(v2i32 (sub + (v2i32 (bitconvert (v8i8 Neon_AllZero))), + (v2i32 VPR64:$Rn))), + (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>; +def : Pat<(v2i64 (sub + (v2i64 (bitconvert (v16i8 Neon_AllZero))), + (v2i64 VPR128:$Rn))), + (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>; + +multiclass NeonI_2VMisc_BHSDsize_2Args opcode> { + let Constraints = "$src = $Rd" in { + def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [], NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [], NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [], NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [], NoItinerary>; + } +} + +defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>; +defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>; + +multiclass NeonI_2VMisc_BHSD_2Args_Pattern { + def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))), + (v16i8 (!cast(Prefix # 16b) + (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>; + + def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))), + (v8i16 (!cast(Prefix # 8h) + (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))), + (v4i32 (!cast(Prefix # 4s) + (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))), + (v2i64 (!cast(Prefix # 2d) + (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>; + + def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))), + (v8i8 (!cast(Prefix # 8b) + (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))), + (v4i16 (!cast(Prefix # 4h) + (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))), + (v2i32 (!cast(Prefix # 2s) + (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>; +} + +defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>; +defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>; + +multiclass NeonI_2VMisc_BHSsizes { + def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [(set (v8i16 
VPR128:$Rd), + (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>; +defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>; + +multiclass NeonI_2VMisc_Bsize size, + bits<5> Opcode> { + def 16b : NeonI_2VMisc<0b1, U, size, Opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, size, Opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; +} + +defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>; +defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>; +defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>; + +def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b", + (NOT16b VPR128:$Rd, VPR128:$Rn), 0>; +def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b", + (NOT8b VPR64:$Rd, VPR64:$Rn), 0>; + +def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))), + (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))), + (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>; + +def : Pat<(v16i8 (xor + (v16i8 VPR128:$Rn), + (v16i8 Neon_AllOne))), + (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (xor + (v8i8 VPR64:$Rn), + (v8i8 Neon_AllOne))), + (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>; +def : Pat<(v8i16 (xor + (v8i16 VPR128:$Rn), + (v8i16 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; +def : Pat<(v4i16 (xor + (v4i16 VPR64:$Rn), + (v4i16 (bitconvert (v8i8 Neon_AllOne))))), + (NOT8b VPR64:$Rn)>; +def : Pat<(v4i32 (xor + (v4i32 VPR128:$Rn), + (v4i32 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; +def : Pat<(v2i32 (xor + (v2i32 VPR64:$Rn), + (v2i32 (bitconvert (v8i8 Neon_AllOne))))), + (NOT8b VPR64:$Rn)>; +def : Pat<(v2i64 (xor + (v2i64 VPR128:$Rn), + (v2i64 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; + +def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))), + (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))), + (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>; + +multiclass NeonI_2VMisc_SDsizes opcode, + SDPatternOperator Neon_Op> { + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4f32 VPR128:$Rd), + (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))], + NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [(set (v2f64 VPR128:$Rd), + (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2f32 VPR64:$Rd), + (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))], + NoItinerary>; +} + +defm FABS : NeonI_2VMisc_SDsizes<"fabs", 
0b0, 0b01111, fabs>; +defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>; + +multiclass NeonI_2VMisc_HSD_Narrow opcode> { + def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8b, $Rn.8h", + [], NoItinerary>; + + def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4h, $Rn.4s", + [], NoItinerary>; + + def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + let Constraints = "$Rd = $src" in { + def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.16b, $Rn.8h", + [], NoItinerary>; + + def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.8h, $Rn.4s", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary>; + } +} + +defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>; +defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>; +defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>; +defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>; + +multiclass NeonI_2VMisc_Narrow_Patterns { + def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))), + (v8i8 (!cast(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))), + (v4i16 (!cast(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))), + (v2i32 (!cast(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>; + + def : Pat<(v16i8 (concat_vectors + (v8i8 VPR64:$src), + (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))), + (!cast(Prefix # 8h16b) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; + + def : Pat<(v8i16 (concat_vectors + (v4i16 VPR64:$src), + (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))), + (!cast(Prefix # 4s8h) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; + + def : Pat<(v4i32 (concat_vectors + (v2i32 VPR64:$src), + (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))), + (!cast(Prefix # 2d4s) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; +} + +defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>; +defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>; +defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>; +defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>; + +multiclass NeonI_2VMisc_SHIFT opcode> { + let DecoderMethod = "DecodeSHLLInstruction" in { + def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact8:$Imm), + asmop # "\t$Rd.8h, $Rn.8b, $Imm", + [], NoItinerary>; + + def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact16:$Imm), + asmop # "\t$Rd.4s, $Rn.4h, $Imm", + [], NoItinerary>; + + def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact32:$Imm), + asmop # "\t$Rd.2d, $Rn.2s, $Imm", + [], NoItinerary>; + + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact8:$Imm), + asmop # "2\t$Rd.8h, $Rn.16b, $Imm", + [], NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact16:$Imm), + asmop # "2\t$Rd.4s, $Rn.8h, $Imm", + [], NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact32:$Imm), + asmop # "2\t$Rd.2d, $Rn.4s, $Imm", 
+ [], NoItinerary>; + } +} + +defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>; + +class NeonI_SHLL_Patterns + : Pat<(DesTy (shl + (DesTy (ExtOp (OpTy VPR64:$Rn))), + (DesTy (Neon_vdup + (i32 Neon_Imm:$Imm))))), + (!cast("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>; + +class NeonI_SHLL_High_Patterns + : Pat<(DesTy (shl + (DesTy (ExtOp + (OpTy (GetHigh VPR128:$Rn)))), + (DesTy (Neon_vdup + (i32 Neon_Imm:$Imm))))), + (!cast("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>; + +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; + +multiclass NeonI_2VMisc_SD_Narrow opcode> { + def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4h, $Rn.4s", + [], NoItinerary>; + + def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + let Constraints = "$src = $Rd" in { + def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.8h, $Rn.4s", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary>; + } +} + +defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>; + +multiclass NeonI_2VMisc_Narrow_Pattern { + + def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))), + (!cast(prefix # "4s4h") (v4f32 VPR128:$Rn))>; + + def : Pat<(v8i16 (concat_vectors + (v4i16 VPR64:$src), + (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))), + (!cast(prefix # "4s8h") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + (v4f32 VPR128:$Rn))>; + + def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))), + (!cast(prefix # "2d2s") (v2f64 VPR128:$Rn))>; + + def : Pat<(v4f32 (concat_vectors + (v2f32 VPR64:$src), + (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))), + (!cast(prefix # "2d4s") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + (v2f64 VPR128:$Rn))>; +} + +defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>; + +multiclass NeonI_2VMisc_D_Narrow opcode> { + def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary> { + let Constraints = "$src = $Rd"; + } + + def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))), + (!cast(prefix # "2d2s") VPR128:$Rn)>; + + def : Pat<(v4f32 (concat_vectors + (v2f32 VPR64:$src), + (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))), + (!cast(prefix # "2d4s") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + VPR128:$Rn)>; +} + +defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>; + +def Neon_High4Float : PatFrag<(ops node:$in), + (extract_subvector (v4f32 node:$in), (iPTR 2))>; + +multiclass NeonI_2VMisc_HS_Extend opcode> { + def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4s, $Rn.4h", + [], NoItinerary>; + + def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2d, $Rn.2s", + [], NoItinerary>; + + def 8h4s : 
NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.8h", + [], NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "2\t$Rd.2d, $Rn.4s", + [], NoItinerary>; +} + +defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>; + +multiclass NeonI_2VMisc_Extend_Pattern { + def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))), + (!cast(prefix # "4h4s") VPR64:$Rn)>; + + def : Pat<(v4f32 (int_arm_neon_vcvthf2fp + (v4i16 (Neon_High8H + (v8i16 VPR128:$Rn))))), + (!cast(prefix # "8h4s") VPR128:$Rn)>; + + def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))), + (!cast(prefix # "2s2d") VPR64:$Rn)>; + + def : Pat<(v2f64 (fextend + (v2f32 (Neon_High4Float + (v4f32 VPR128:$Rn))))), + (!cast(prefix # "4s2d") VPR128:$Rn)>; +} + +defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">; + +multiclass NeonI_2VMisc_SD_Conv opcode, + ValueType ResTy4s, ValueType OpTy4s, + ValueType ResTy2d, ValueType OpTy2d, + ValueType ResTy2s, ValueType OpTy2s, + SDPatternOperator Neon_Op> { + + def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (ResTy4s VPR128:$Rd), + (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))], + NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [(set (ResTy2d VPR128:$Rd), + (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (ResTy2s VPR64:$Rd), + (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))], + NoItinerary>; +} + +multiclass NeonI_2VMisc_fp_to_int opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv; +} + +defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010, + int_aarch64_neon_fcvtns>; +defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010, + int_aarch64_neon_fcvtnu>; +defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010, + int_aarch64_neon_fcvtps>; +defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010, + int_aarch64_neon_fcvtpu>; +defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011, + int_aarch64_neon_fcvtms>; +defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011, + int_aarch64_neon_fcvtmu>; +defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>; +defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>; +defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100, + int_aarch64_neon_fcvtas>; +defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100, + int_aarch64_neon_fcvtau>; + +multiclass NeonI_2VMisc_int_to_fp opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv; +} + +defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>; +defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>; + +multiclass NeonI_2VMisc_fp_to_fp opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv; +} + +defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000, + int_aarch64_neon_frintn>; +defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>; +defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>; +defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>; +defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>; +defm FRINTZ : 
NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>; +defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>; +defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101, + int_arm_neon_vrecpe>; +defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101, + int_arm_neon_vrsqrte>; +defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>; + +multiclass NeonI_2VMisc_S_Conv opcode, SDPatternOperator Neon_Op> { + def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100, + int_arm_neon_vrecpe>; +defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100, + int_arm_neon_vrsqrte>; + +// Crypto Class +class NeonI_Cryptoaes_2v size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_AES{ + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>; +def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>; + +class NeonI_Cryptoaes size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_AES; + +def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>; +def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>; + +class NeonI_Cryptosha_vv size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_SHA { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1", + int_arm_neon_sha1su1>; +def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0", + int_arm_neon_sha256su0>; + +class NeonI_Cryptosha_ss size, bits<5> opcode, + string asmop, SDPatternOperator opnode> + : NeonI_Crypto_SHA { + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>; + +class NeonI_Cryptosha3_vvv size, bits<3> opcode, string asmop, + SDPatternOperator opnode> + : NeonI_Crypto_3VSHA { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0", + int_arm_neon_sha1su0>; +def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1", + int_arm_neon_sha256su1>; + +class NeonI_Cryptosha3_qqv size, bits<3> opcode, string asmop, + SDPatternOperator opnode> + : NeonI_Crypto_3VSHA { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h", + int_arm_neon_sha256h>; +def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2", + int_arm_neon_sha256h2>; + +class NeonI_Cryptosha3_qsv size, bits<3> opcode, string asmop, + SDPatternOperator opnode> + : NeonI_Crypto_3VSHA { + let Constraints = "$src = $Rd"; + let Predicates = [HasNEON, HasCrypto]; +} + +def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>; +def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", int_aarch64_neon_sha1p>; +def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>; + +// +// Patterns for handling half-precision values +// + +// 
Convert f16 value coming in as i16 value to f32 +def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))), + (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; +def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))), + (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; + +def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 ( + f32_to_f16 (f32 FPR32:$Rn))))))), + (f32 FPR32:$Rn)>; + +// Patterns for vector extract of half-precision FP value in i16 storage type +def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract + (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))), + (FCVTsh (f16 (DUPhv_H + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + neon_uimm2_bare:$Imm)))>; + +def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract + (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))), + (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>; + +// Patterns for vector insert of half-precision FP value 0 in i16 storage type +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), + (neon_uimm3_bare:$Imm))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), + sub_16)), + neon_uimm3_bare:$Imm, 0))>; + +def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), + (neon_uimm2_bare:$Imm))), + (v4i16 (EXTRACT_SUBREG + (v8i16 (INSELh + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), + sub_16)), + neon_uimm2_bare:$Imm, 0)), + sub_64))>; + +// Patterns for vector insert of half-precision FP value in i16 storage type +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (assertsext (i32 (fp_to_sint + (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), + (neon_uimm3_bare:$Imm))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), + sub_16)), + neon_uimm3_bare:$Imm, 0))>; + +def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), + (i32 (assertsext (i32 (fp_to_sint + (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), + (neon_uimm2_bare:$Imm))), + (v4i16 (EXTRACT_SUBREG + (v8i16 (INSELh + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v8i16 (SUBREG_TO_REG (i64 0), + (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), + sub_16)), + neon_uimm2_bare:$Imm, 0)), + sub_64))>; + +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), + (neon_uimm3_bare:$Imm1))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), + neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; + +// Patterns for vector copy of half-precision FP value in i16 storage type +def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 + (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), + 65535)))))))), + (neon_uimm3_bare:$Imm1))), + (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), + neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; + +def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), + (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 + (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)), + 65535)))))))), + (neon_uimm3_bare:$Imm1))), + (v4i16 (EXTRACT_SUBREG + (v8i16 (INSELh + (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + 
(v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)), + sub_64))>; + + diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 3d22330afe72..8cfb968237e4 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -109,6 +109,11 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; + case MachineOperand::MO_FPImmediate: { + assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported"); + MCOp = MCOperand::CreateFPImm(0.0); + break; + } case MachineOperand::MO_BlockAddress: MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress())); break; @@ -116,7 +121,7 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName())); break; case MachineOperand::MO_GlobalAddress: - MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal())); + MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal())); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 20b0dcf86f46..75ec44f3fecb 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -29,9 +29,8 @@ using namespace llvm; -AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo &tii, - const AArch64Subtarget &sti) - : AArch64GenRegisterInfo(AArch64::X30), TII(tii) { +AArch64RegisterInfo::AArch64RegisterInfo() + : AArch64GenRegisterInfo(AArch64::X30) { } const uint16_t * @@ -122,6 +121,8 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI, return; } + const AArch64InstrInfo &TII = + *static_cast(MF.getTarget().getInstrInfo()); int MinOffset, MaxOffset, OffsetScale; if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) { MinOffset = 0; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index bb64fd55b2c3..4d679439936a 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -25,12 +25,7 @@ class AArch64InstrInfo; class AArch64Subtarget; struct AArch64RegisterInfo : public AArch64GenRegisterInfo { -private: - const AArch64InstrInfo &TII; - -public: - AArch64RegisterInfo(const AArch64InstrInfo &tii, - const AArch64Subtarget &sti); + AArch64RegisterInfo(); const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index bd79546371c5..4e2022c06165 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -12,15 +12,25 @@ //===----------------------------------------------------------------------===// let Namespace = "AArch64" in { -def sub_128 : SubRegIndex; -def sub_64 : SubRegIndex; -def sub_32 : SubRegIndex; -def sub_16 : SubRegIndex; -def sub_8 : SubRegIndex; - -// The VPR registers are handled as sub-registers of FPR equivalents, but -// they're really the same thing. We give this concept a special index. 
-def sub_alias : SubRegIndex; +def sub_128 : SubRegIndex<128>; +def sub_64 : SubRegIndex<64>; +def sub_32 : SubRegIndex<32>; +def sub_16 : SubRegIndex<16>; +def sub_8 : SubRegIndex<8>; + +// Note: Code depends on these having consecutive numbers. +def qqsub : SubRegIndex<256, 256>; + +def qsub_0 : SubRegIndex<128>; +def qsub_1 : SubRegIndex<128, 128>; +def qsub_2 : ComposedSubRegIndex; +def qsub_3 : ComposedSubRegIndex; + +def dsub_0 : SubRegIndex<64>; +def dsub_1 : SubRegIndex<64, 64>; +def dsub_2 : ComposedSubRegIndex; +def dsub_3 : ComposedSubRegIndex; +def dsub_4 : ComposedSubRegIndex; } // Registers are identified with 5-bit ID numbers. @@ -137,60 +147,51 @@ foreach Index = 0-31 in { } -def FPR8 : RegisterClass<"AArch64", [i8], 8, +def FPR8 : RegisterClass<"AArch64", [i8, v1i8], 8, (sequence "B%u", 0, 31)> { } -def FPR16 : RegisterClass<"AArch64", [f16], 16, +def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16, (sequence "H%u", 0, 31)> { } -def FPR32 : RegisterClass<"AArch64", [f32], 32, +def FPR32 : RegisterClass<"AArch64", [f32, v1i32, v1f32], 32, (sequence "S%u", 0, 31)> { } -def FPR64 : RegisterClass<"AArch64", [f64], 64, - (sequence "D%u", 0, 31)> { -} +def FPR64 : RegisterClass<"AArch64", + [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], + 64, (sequence "D%u", 0, 31)>; -def FPR128 : RegisterClass<"AArch64", [f128], 128, - (sequence "Q%u", 0, 31)> { -} +def FPR128 : RegisterClass<"AArch64", + [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], + 128, (sequence "Q%u", 0, 31)>; +def FPR64Lo : RegisterClass<"AArch64", + [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], + 64, (sequence "D%u", 0, 15)>; + +def FPR128Lo : RegisterClass<"AArch64", + [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], + 128, (sequence "Q%u", 0, 15)>; //===----------------------------------------------------------------------===// // Vector registers: //===----------------------------------------------------------------------===// -// NEON registers simply specify the overall vector, and it's expected that -// Instructions will individually specify the acceptable data layout. In -// principle this leaves two approaches open: -// + An operand, giving a single ADDvvv instruction (for example). This turns -// out to be unworkable in the assembly parser (without every Instruction -// having a "cvt" function, at least) because the constraints can't be -// properly enforced. It also complicates specifying patterns since each -// instruction will accept many types. -// + A bare token (e.g. ".2d"). This means the AsmParser has to know specific -// details about NEON registers, but simplifies most other details. -// -// The second approach was taken. - -foreach Index = 0-31 in { - def V # Index : AArch64RegWithSubs("Q" # Index)], - [sub_alias]>, - DwarfRegNum<[!add(Index, 64)]>; +def VPR64AsmOperand : AsmOperandClass { + let Name = "VPR"; + let PredicateMethod = "isReg"; + let RenderMethod = "addRegOperands"; } -// These two classes contain the same registers, which should be reasonably -// sensible for MC and allocation purposes, but allows them to be treated -// separately for things like stack spilling. 
-def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64, - (sequence "V%u", 0, 31)>; +def VPR64 : RegisterOperand; + +def VPR128 : RegisterOperand; + +def VPR64Lo : RegisterOperand; -def VPR128 : RegisterClass<"AArch64", - [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128, - (sequence "V%u", 0, 31)>; +def VPR128Lo : RegisterOperand; // Flags register def NZCV : Register<"nzcv"> { @@ -201,3 +202,90 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { let CopyCost = -1; let isAllocatable = 0; } + +//===----------------------------------------------------------------------===// +// Consecutive vector registers +//===----------------------------------------------------------------------===// +// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31 +def Tuples2D : RegisterTuples<[dsub_0, dsub_1], + [(rotl FPR64, 0), (rotl FPR64, 1)]>; + +// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1 +def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; + +// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2 +def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; + +// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31 +def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], + [(rotl FPR128, 0), (rotl FPR128, 1)]>; + +// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1 +def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; + +// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2 +def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; + +// The followings are super register classes to model 2/3/4 consecutive +// 64-bit/128-bit registers. + +def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>; + +def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> { + let Size = 192; // 3 x 64 bits, we have no predefined type of that size. +} + +def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>; + +def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>; + +def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> { + let Size = 384; // 3 x 128 bits, we have no predefined type of that size. 
+} + +def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>; + + +// The followings are vector list operands +multiclass VectorList_operands { + def _asmoperand : AsmOperandClass { + let Name = PREFIX # LAYOUT # Count; + let RenderMethod = "addVectorListOperands"; + let PredicateMethod = + "isVectorList"; + let ParserMethod = "ParseVectorList"; + } + + def _operand : RegisterOperand"> { + let ParserMatchClass = + !cast(PREFIX # LAYOUT # "_asmoperand"); + } +} + +multiclass VectorList_BHSD { + defm 8B : VectorList_operands; + defm 4H : VectorList_operands; + defm 2S : VectorList_operands; + defm 1D : VectorList_operands; + defm 16B : VectorList_operands; + defm 8H : VectorList_operands; + defm 4S : VectorList_operands; + defm 2D : VectorList_operands; +} + +// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand +defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>; +defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>; +defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>; +defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>; \ No newline at end of file diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index d17b73820994..5c693c18c6a6 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -25,13 +25,31 @@ using namespace llvm; +// Pin the vtable to this file. +void AArch64Subtarget::anchor() {} + AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS) - : AArch64GenSubtargetInfo(TT, CPU, FS) - , HasNEON(true) - , HasCrypto(true) - , TargetTriple(TT) { + : AArch64GenSubtargetInfo(TT, CPU, FS), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), TargetTriple(TT), CPUString(CPU) { + + initializeSubtargetFeatures(CPU, FS); +} + +void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU, + StringRef FS) { + if (CPU.empty()) + CPUString = "generic"; + + std::string FullFS = FS; + if (CPUString == "generic") { + // Enable FP by default. + if (FullFS.empty()) + FullFS = "+fp-armv8"; + else + FullFS = "+fp-armv8," + FullFS; + } - ParseSubtargetFeatures(CPU, FS); + ParseSubtargetFeatures(CPU, FullFS); } bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV, diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 2e9205fc9924..bbfd3bc7dfac 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -27,18 +27,31 @@ class StringRef; class GlobalValue; class AArch64Subtarget : public AArch64GenSubtargetInfo { + virtual void anchor(); protected: + bool HasFPARMv8; bool HasNEON; bool HasCrypto; /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; + + /// CPUString - String name of used CPU. + std::string CPUString; + +private: + void initializeSubtargetFeatures(StringRef CPU, StringRef FS); + public: /// This constructor initializes the data members to match that /// of the specified triple. /// AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS); + virtual bool enableMachineScheduler() const { + return true; + } + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef FS); @@ -46,8 +59,13 @@ public: bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + + bool hasFPARMv8() const { return HasFPARMv8; } + bool hasNEON() const { return HasNEON; } + bool hasCrypto() const { return HasCrypto; } + const std::string & getCPUString() const { return CPUString; } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index df599d599dd6..f1695e2ce207 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -38,6 +38,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget) { + initAsmInfo(); } namespace { diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 69bb80a48537..fbbce116ad82 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -54,8 +54,9 @@ public: #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { MCAsmParserExtension::Initialize(_Parser); // Initialize the set of available features. @@ -126,6 +127,11 @@ public: OperandMatchResultTy ParseSysRegOperand(SmallVectorImpl &Operands); + bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout, + SMLoc &LayoutLoc); + + OperandMatchResultTy ParseVectorList(SmallVectorImpl &); + bool validateInstruction(MCInst &Inst, const SmallVectorImpl &Operands); @@ -153,6 +159,7 @@ private: k_Immediate, // Including expressions referencing symbols k_Register, k_ShiftExtend, + k_VectorList, // A sequential list of 1 to 4 registers. k_SysReg, // The register operand of MRS and MSR instructions k_Token, // The mnemonic; other raw tokens the auto-generated k_WrappedRegister // Load/store exclusive permit a wrapped register. @@ -188,6 +195,13 @@ private: bool ImplicitAmount; }; + // A vector register list is a sequential list of 1 to 4 registers. 
+ struct VectorListOp { + unsigned RegNum; + unsigned Count; + A64Layout::VectorLayout Layout; + }; + struct SysRegOp { const char *Data; unsigned Length; @@ -205,6 +219,7 @@ private: struct ImmOp Imm; struct RegOp Reg; struct ShiftExtendOp ShiftExtend; + struct VectorListOp VectorList; struct SysRegOp SysReg; struct TokOp Tok; }; @@ -454,7 +469,7 @@ public: } bool isMOVN32Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_SABS_G0, AArch64MCExpr::VK_AARCH64_SABS_G1, AArch64MCExpr::VK_AARCH64_DTPREL_G1, @@ -463,13 +478,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(32, PermittedModifiers, NumModifiers); } bool isMOVN64Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_SABS_G0, AArch64MCExpr::VK_AARCH64_SABS_G1, AArch64MCExpr::VK_AARCH64_SABS_G2, @@ -481,14 +496,14 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(64, PermittedModifiers, NumModifiers); } bool isMOVZ32Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0, AArch64MCExpr::VK_AARCH64_ABS_G1, AArch64MCExpr::VK_AARCH64_SABS_G0, @@ -499,13 +514,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(32, PermittedModifiers, NumModifiers); } bool isMOVZ64Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0, AArch64MCExpr::VK_AARCH64_ABS_G1, AArch64MCExpr::VK_AARCH64_ABS_G2, @@ -521,13 +536,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1, AArch64MCExpr::VK_AARCH64_TPREL_G0, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(64, PermittedModifiers, NumModifiers); } bool isMOVK32Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0_NC, AArch64MCExpr::VK_AARCH64_ABS_G1_NC, AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, @@ -536,13 +551,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(32, PermittedModifiers, NumModifiers); } bool isMOVK64Imm() const { - static AArch64MCExpr::VariantKind PermittedModifiers[] = { + static const AArch64MCExpr::VariantKind PermittedModifiers[] = { AArch64MCExpr::VK_AARCH64_ABS_G0_NC, AArch64MCExpr::VK_AARCH64_ABS_G1_NC, AArch64MCExpr::VK_AARCH64_ABS_G2_NC, @@ -553,13 +568,13 @@ public: AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, 
AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, }; - unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); return isMoveWideImm(64, PermittedModifiers, NumModifiers); } bool isMoveWideImm(unsigned RegWidth, - AArch64MCExpr::VariantKind *PermittedModifiers, + const AArch64MCExpr::VariantKind *PermittedModifiers, unsigned NumModifiers) const { if (!isImmWithLSL()) return false; @@ -664,8 +679,86 @@ public: return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4; } - template bool isSImm7Scaled() const { - if (!isImm()) return false; + // if 0 < value <= w, return true + bool isShrFixedWidth(int w) const { + if (!isImm()) + return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) + return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= w; + } + + bool isShrImm8() const { return isShrFixedWidth(8); } + + bool isShrImm16() const { return isShrFixedWidth(16); } + + bool isShrImm32() const { return isShrFixedWidth(32); } + + bool isShrImm64() const { return isShrFixedWidth(64); } + + // if 0 <= value < w, return true + bool isShlFixedWidth(int w) const { + if (!isImm()) + return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) + return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < w; + } + + bool isShlImm8() const { return isShlFixedWidth(8); } + + bool isShlImm16() const { return isShlFixedWidth(16); } + + bool isShlImm32() const { return isShlFixedWidth(32); } + + bool isShlImm64() const { return isShlFixedWidth(64); } + + bool isNeonMovImmShiftLSL() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + // Valid shift amount is 0, 8, 16 and 24. + return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24; + } + + bool isNeonMovImmShiftLSLH() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + // Valid shift amount is 0 and 8. + return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8; + } + + bool isNeonMovImmShiftMSL() const { + if (!isShiftOrExtend()) + return false; + + if (ShiftExtend.ShiftType != A64SE::MSL) + return false; + + // Valid shift amount is 8 and 16. + return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16; + } + + template + bool isVectorList() const { + return Kind == k_VectorList && VectorList.Layout == Layout && + VectorList.Count == Count; + } + + template bool isSImm7Scaled() const { + if (!isImm()) + return false; const MCConstantExpr *CE = dyn_cast(getImm()); if (!CE) return false; @@ -705,10 +798,38 @@ public: return isa(getImm()); } + bool isNeonUImm64Mask() const { + if (!isImm()) + return false; + + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) + return false; + + uint64_t Value = CE->getValue(); + + // i64 value with each byte being either 0x00 or 0xff. 
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8) + if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) + return false; + return true; + } + + // if value == N, return true + template + bool isExactImm() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + + return CE->getValue() == N; + } + static AArch64Operand *CreateImmWithLSL(const MCExpr *Val, unsigned ShiftAmount, bool ImplicitAmount, - SMLoc S, SMLoc E) { + SMLoc S,SMLoc E) { AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E); Op->ImmWithLSL.Val = Val; Op->ImmWithLSL.ShiftAmount = ShiftAmount; @@ -766,6 +887,18 @@ public: return Op; } + static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, + A64Layout::VectorLayout Layout, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.Layout = Layout; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + static AArch64Operand *CreateToken(StringRef Str, SMLoc S) { AArch64Operand *Op = new AArch64Operand(k_Token, S, S); Op->Tok.Data = Str.data(); @@ -1026,6 +1159,40 @@ public: Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); } + // For Vector Immediates shifted imm operands. + void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3. + int64_t Imm = ShiftExtend.Amount / 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode LSLH shift amount 0, 8 as 0, 1. + int64_t Imm = ShiftExtend.Amount / 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16) + llvm_unreachable("Invalid shift amount for vector immediate inst."); + + // Encode MSL shift amount 8, 16 as 0, 1. + int64_t Imm = ShiftExtend.Amount / 8 - 1; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + // For the extend in load-store (register offset) instructions. template void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const { @@ -1065,6 +1232,25 @@ public: Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); } + + void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + // A bit from each byte in the constant forms the encoded immediate + const MCConstantExpr *CE = dyn_cast(getImm()); + uint64_t Value = CE->getValue(); + + unsigned Imm = 0; + for (unsigned i = 0; i < 8; ++i, Value >>= 8) { + Imm |= (Value & 1) << i; + } + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addVectorListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); + } }; } // end anonymous namespace. @@ -1104,7 +1290,6 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl &Operands, else return MatchOperand_Success; } - // ... 
or it might be a symbolish thing } // Fall through @@ -1148,7 +1333,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl &Operands, return ParseOperand(Operands, Mnemonic); } // The following will likely be useful later, but not in very early cases - case AsmToken::LCurly: // Weird SIMD lists + case AsmToken::LCurly: // SIMD vector list is not parsed here llvm_unreachable("Don't know how to deal with '{' in operand"); return MatchOperand_ParseFail; } @@ -1306,7 +1491,7 @@ AArch64AsmParser::ParseImmWithLSLOperand( // The optional operand must be "lsl #N" where N is non-negative. if (Parser.getTok().is(AsmToken::Identifier) - && Parser.getTok().getIdentifier().lower() == "lsl") { + && Parser.getTok().getIdentifier().equals_lower("lsl")) { Parser.Lex(); if (Parser.getTok().is(AsmToken::Hash)) { @@ -1363,9 +1548,8 @@ AArch64AsmParser::ParseCRxOperand( return MatchOperand_ParseFail; } - std::string LowerTok = Parser.getTok().getIdentifier().lower(); - StringRef Tok(LowerTok); - if (Tok[0] != 'c') { + StringRef Tok = Parser.getTok().getIdentifier(); + if (Tok[0] != 'c' && Tok[0] != 'C') { Error(S, "Expected cN operand where 0 <= N <= 15"); return MatchOperand_ParseFail; } @@ -1437,22 +1621,11 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, std::string LowerReg = Tok.getString().lower(); size_t DotPos = LowerReg.find('.'); - RegNum = MatchRegisterName(LowerReg.substr(0, DotPos)); - if (RegNum == AArch64::NoRegister) { - RegNum = StringSwitch(LowerReg.substr(0, DotPos)) - .Case("ip0", AArch64::X16) - .Case("ip1", AArch64::X17) - .Case("fp", AArch64::X29) - .Case("lr", AArch64::X30) - .Default(AArch64::NoRegister); - } - if (RegNum == AArch64::NoRegister) - return false; - + bool IsVec128 = false; SMLoc S = Tok.getLoc(); RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos); - if (DotPos == StringRef::npos) { + if (DotPos == std::string::npos) { Layout = StringRef(); } else { // Everything afterwards needs to be a literal token, expected to be @@ -1462,20 +1635,78 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, // gives us a permanent string to use in the token (a pointer into LowerReg // would go out of scope when we return). LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1); - std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos); + StringRef LayoutText = StringRef(LowerReg).substr(DotPos); + + // See if it's a 128-bit layout first. Layout = StringSwitch(LayoutText) - .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d") - .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s") - .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h") - .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b") + .Case(".q", ".q").Case(".1q", ".1q") + .Case(".d", ".d").Case(".2d", ".2d") + .Case(".s", ".s").Case(".4s", ".4s") + .Case(".h", ".h").Case(".8h", ".8h") + .Case(".b", ".b").Case(".16b", ".16b") .Default(""); + if (Layout.size() != 0) + IsVec128 = true; + else { + Layout = StringSwitch(LayoutText) + .Case(".1d", ".1d") + .Case(".2s", ".2s") + .Case(".4h", ".4h") + .Case(".8b", ".8b") + .Default(""); + } + if (Layout.size() == 0) { - // Malformed register + // If we've still not pinned it down the register is malformed. return false; } } + RegNum = MatchRegisterName(LowerReg.substr(0, DotPos)); + if (RegNum == AArch64::NoRegister) { + RegNum = StringSwitch(LowerReg.substr(0, DotPos)) + .Case("ip0", AArch64::X16) + .Case("ip1", AArch64::X17) + .Case("fp", AArch64::X29) + .Case("lr", AArch64::X30) + .Case("v0", IsVec128 ? 
AArch64::Q0 : AArch64::D0) + .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1) + .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2) + .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3) + .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4) + .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5) + .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6) + .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7) + .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8) + .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9) + .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10) + .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11) + .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12) + .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13) + .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14) + .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15) + .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16) + .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17) + .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18) + .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19) + .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20) + .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21) + .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22) + .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23) + .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24) + .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25) + .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26) + .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27) + .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28) + .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29) + .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30) + .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31) + .Default(AArch64::NoRegister); + } + if (RegNum == AArch64::NoRegister) + return false; + return true; } @@ -1507,6 +1738,7 @@ AArch64AsmParser::ParseRegister(SmallVectorImpl &Operands, case 'h': NumLanes = 8; break; case 's': NumLanes = 4; break; case 'd': NumLanes = 2; break; + case 'q': NumLanes = 1; break; } } @@ -1660,20 +1892,21 @@ AArch64AsmParser::ParseShiftExtend( std::string LowerID = IDVal.lower(); A64SE::ShiftExtSpecifiers Spec = - StringSwitch(LowerID) - .Case("lsl", A64SE::LSL) - .Case("lsr", A64SE::LSR) - .Case("asr", A64SE::ASR) - .Case("ror", A64SE::ROR) - .Case("uxtb", A64SE::UXTB) - .Case("uxth", A64SE::UXTH) - .Case("uxtw", A64SE::UXTW) - .Case("uxtx", A64SE::UXTX) - .Case("sxtb", A64SE::SXTB) - .Case("sxth", A64SE::SXTH) - .Case("sxtw", A64SE::SXTW) - .Case("sxtx", A64SE::SXTX) - .Default(A64SE::Invalid); + StringSwitch(LowerID) + .Case("lsl", A64SE::LSL) + .Case("msl", A64SE::MSL) + .Case("lsr", A64SE::LSR) + .Case("asr", A64SE::ASR) + .Case("ror", A64SE::ROR) + .Case("uxtb", A64SE::UXTB) + .Case("uxth", A64SE::UXTH) + .Case("uxtw", A64SE::UXTW) + .Case("uxtx", A64SE::UXTX) + .Case("sxtb", A64SE::SXTB) + .Case("sxth", A64SE::SXTH) + .Case("sxtw", A64SE::SXTW) + .Case("sxtx", A64SE::SXTX) + .Default(A64SE::Invalid); if (Spec == A64SE::Invalid) return MatchOperand_NoMatch; @@ -1683,8 +1916,8 @@ AArch64AsmParser::ParseShiftExtend( S = Parser.getTok().getLoc(); Parser.Lex(); - if (Spec != A64SE::LSL && Spec != A64SE::LSR && - Spec != A64SE::ASR && Spec != A64SE::ROR) { + if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR && + Spec != A64SE::ROR && Spec != A64SE::MSL) { // The shift amount can be omitted for the extending versions, but not real // shifts: // add x0, x0, x0, uxtb @@ -1724,6 +1957,148 @@ AArch64AsmParser::ParseShiftExtend( return MatchOperand_Success; } +/// Try to 
parse a vector register token, If it is a vector register, +/// the token is eaten and return true. Otherwise return false. +bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, + StringRef &Layout, SMLoc &LayoutLoc) { + bool IsVector = true; + + if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)) + IsVector = false; + else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID] + .contains(RegNum) && + !AArch64MCRegisterClasses[AArch64::FPR128RegClassID] + .contains(RegNum)) + IsVector = false; + else if (Layout.size() == 0) + IsVector = false; + + if (!IsVector) + Error(Parser.getTok().getLoc(), "expected vector type register"); + + Parser.Lex(); // Eat this token. + return IsVector; +} + + +// A vector list contains 1-4 consecutive registers. +// Now there are two kinds of vector list when number of vector > 1: +// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} +// (2) {Vn.layout - Vm.layout} +// If the layout is like .b/.h/.s/.d, also parse the lane. +AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList( + SmallVectorImpl &Operands) { + if (Parser.getTok().isNot(AsmToken::LCurly)) { + Error(Parser.getTok().getLoc(), "'{' expected"); + return MatchOperand_ParseFail; + } + SMLoc SLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '{' token. + + unsigned Reg, Count = 1; + StringRef LayoutStr; + SMLoc RegEndLoc, LayoutLoc; + if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc)) + return MatchOperand_ParseFail; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + + unsigned Reg2; + StringRef LayoutStr2; + SMLoc RegEndLoc2, LayoutLoc2; + SMLoc RegLoc2 = Parser.getTok().getLoc(); + + if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) + return MatchOperand_ParseFail; + unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg); + + if (LayoutStr != LayoutStr2) { + Error(LayoutLoc2, "expected the same vector layout"); + return MatchOperand_ParseFail; + } + if (Space == 0 || Space > 3) { + Error(RegLoc2, "invalid number of vectors"); + return MatchOperand_ParseFail; + } + + Count += Space; + } else { + unsigned LastReg = Reg; + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + unsigned Reg2; + StringRef LayoutStr2; + SMLoc RegEndLoc2, LayoutLoc2; + SMLoc RegLoc2 = Parser.getTok().getLoc(); + + if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) + return MatchOperand_ParseFail; + unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg) + : (Reg2 + 32 - LastReg); + Count++; + + // The space between two vectors should be 1. And they should have the same layout. + // Total count shouldn't be great than 4 + if (Space != 1) { + Error(RegLoc2, "invalid space between two vectors"); + return MatchOperand_ParseFail; + } + if (LayoutStr != LayoutStr2) { + Error(LayoutLoc2, "expected the same vector layout"); + return MatchOperand_ParseFail; + } + if (Count > 4) { + Error(RegLoc2, "invalid number of vectors"); + return MatchOperand_ParseFail; + } + + LastReg = Reg2; + } + } + + if (Parser.getTok().isNot(AsmToken::RCurly)) { + Error(Parser.getTok().getLoc(), "'}' expected"); + return MatchOperand_ParseFail; + } + SMLoc ELoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '}' token. + + A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr); + if (Count > 1) { // If count > 1, create vector list using super register. 
+ bool IsVec64 = (Layout < A64Layout::VL_16B); + static unsigned SupRegIDs[3][2] = { + { AArch64::QPairRegClassID, AArch64::DPairRegClassID }, + { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID }, + { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID } + }; + unsigned SupRegID = SupRegIDs[Count - 2][static_cast(IsVec64)]; + unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0; + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + Reg = MRI->getMatchingSuperReg(Reg, Sub0, + &AArch64MCRegisterClasses[SupRegID]); + } + Operands.push_back( + AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc)); + + if (Parser.getTok().is(AsmToken::LBrac)) { + uint32_t NumLanes = 0; + switch(Layout) { + case A64Layout::VL_B : NumLanes = 16; break; + case A64Layout::VL_H : NumLanes = 8; break; + case A64Layout::VL_S : NumLanes = 4; break; + case A64Layout::VL_D : NumLanes = 2; break; + default: + SMLoc Loc = getLexer().getLoc(); + Error(Loc, "expected comma before next operand"); + return MatchOperand_ParseFail; + } + return ParseNEONLane(Operands, NumLanes); + } else { + return MatchOperand_Success; + } +} + // FIXME: We would really like to be able to tablegen'erate this. bool AArch64AsmParser:: validateInstruction(MCInst &Inst, @@ -1918,7 +2293,7 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/); + getParser().getStreamer().EmitValue(Value, Size); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -2019,7 +2394,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, "expected compatible register or floating-point constant"); case Match_FPZero: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected floating-point constant #0.0"); + "expected floating-point constant #0.0 or invalid register type"); case Match_Label: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), "expected label or encodable integer pc offset"); @@ -2140,6 +2515,30 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_Width64: return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), "expected integer in range [, 63]"); + case Match_ShrImm8: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 8]"); + case Match_ShrImm16: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 16]"); + case Match_ShrImm32: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 32]"); + case Match_ShrImm64: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 64]"); + case Match_ShlImm8: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 7]"); + case Match_ShlImm16: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 15]"); + case Match_ShlImm32: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 31]"); + case Match_ShlImm64: + return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 63]"); } llvm_unreachable("Implement any new match types added!"); diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 8164d6f73c97..0f2e81693198 100644 
--- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -28,6 +28,8 @@ add_llvm_target(AArch64CodeGen AArch64TargetObjectFile.cpp ) +add_dependencies(LLVMAArch64CodeGen AArch64CommonTableGen) + add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 12c1b8f4c81a..be4d7f22b2b1 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -38,7 +38,7 @@ typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { /// AArch64 disassembler for all AArch64 platforms. class AArch64Disassembler : public MCDisassembler { - const MCRegisterInfo *RegInfo; + OwningPtr RegInfo; public: /// Initializes the disassembler. /// @@ -46,8 +46,7 @@ public: : MCDisassembler(STI), RegInfo(Info) { } - ~AArch64Disassembler() { - } + ~AArch64Disassembler() {} /// See MCDisassembler. DecodeStatus getInstruction(MCInst &instr, @@ -57,7 +56,7 @@ public: raw_ostream &vStream, raw_ostream &cStream) const; - const MCRegisterInfo *getRegInfo() const { return RegInfo; } + const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); } }; } @@ -83,12 +82,38 @@ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, @@ -111,6 +136,30 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus 
DecodeShiftRightImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); + template static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, unsigned FullImm, @@ -127,6 +176,10 @@ static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, unsigned ShiftAmount, uint64_t Address, const void *Decoder); +template +static DecodeStatus +DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, + uint64_t Address, const void *Decoder); static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, @@ -177,6 +230,17 @@ static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In); @@ -208,7 +272,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, uint8_t bytes[4]; // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) { + if (Region.readBytes(Address, 4, bytes) == -1) { Size = 0; return MCDisassembler::Fail; } @@ -325,6 +389,14 @@ DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus +DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder); +} static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, @@ -338,16 +410,79 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 30) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo, + unsigned RegID, + const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo); + uint16_t Register = getReg(Decoder, RegID, RegNo); Inst.addOperand(MCOperand::CreateReg(Register)); return MCDisassembler::Success; } +static DecodeStatus 
DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID, + Decoder); +} + +static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID, + Decoder); +} + +static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID, + Decoder); +} + +static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID, + Decoder); +} + +static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID, + Decoder); +} + +static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID, + Decoder); +} + static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, uint64_t Address, @@ -396,7 +531,73 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(8 - Val)); + return MCDisassembler::Success; +} +static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(16 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(32 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 7) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 15) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, + uint64_t Address, + const void *Decoder) { + if (Val > 63) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} template static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, @@ -553,11 +754,11 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); if 
(IsToVec) { - DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); } else { DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); } // Add the lane @@ -800,4 +1001,572 @@ extern "C" void LLVMInitializeAArch64Disassembler() { createAArch64Disassembler); } +template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf> +static DecodeStatus +DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, + uint64_t Address, const void *Decoder) { + bool IsLSL = false; + if (Ext == A64SE::LSL) + IsLSL = true; + else if (Ext != A64SE::MSL) + return MCDisassembler::Fail; + + // MSL and LSLH accept encoded shift amount 0 or 1. + if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1) + return MCDisassembler::Fail; + + // LSL accepts encoded shift amount 0, 1, 2 or 3. + if (IsLSL && ShiftAmount > 3) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); + return MCDisassembler::Success; +} + +// Decode post-index vector load/store instructions. +// This is necessary as we need to decode Rm: if Rm == 0b11111, the last +// operand is an immediate equal to the length of the vector list in bytes, +// or Rm is decoded to a GPR64noxzr register. +static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned Rm = fieldFromInstruction(Insn, 16, 5); + unsigned Opcode = fieldFromInstruction(Insn, 12, 4); + unsigned IsLoad = fieldFromInstruction(Insn, 22, 1); + // 0 for 64bit vector list, 1 for 128bit vector list + unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1); + + unsigned NumVecs; + switch (Opcode) { + case 0: // ld4/st4 + case 2: // ld1/st1 with 4 vectors + NumVecs = 4; break; + case 4: // ld3/st3 + case 6: // ld1/st1 with 3 vectors + NumVecs = 3; break; + case 7: // ld1/st1 with 1 vector + NumVecs = 1; break; + case 8: // ld2/st2 + case 10: // ld1/st1 with 2 vectors + NumVecs = 2; break; + default: + llvm_unreachable("Invalid opcode for post-index load/store instructions"); + } + + // Decode vector list of 1/2/3/4 vectors for load instructions. + if (IsLoad) { + switch (NumVecs) { + case 1: + Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder) + : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder); + break; + } + } + + // Decode write back register, which is equal to Rn. + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + if (Rm == 31) // If Rm is 0b11111, add the vector list length in bytes + Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8))); + else // Decode Rm + DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + + // Decode vector list of 1/2/3/4 vectors for store instructions. + if (!IsLoad) { + switch (NumVecs) { + case 1: + Is128BitVec ?
DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder) + : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder) + : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder); + break; + } + } + + return MCDisassembler::Success; +} + +// Decode post-index vector load/store lane instructions. +// This is necessary as we need to decode Rm: if Rm == 0b11111, the last +// operand is an immediate equal the the length of the changed bytes, +// or Rm is decoded to a GPR64noxzr register. +static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + bool Is64bitVec = false; + bool IsLoadDup = false; + bool IsLoad = false; + // The total number of bytes transferred. + // TransferBytes = NumVecs * OneLaneBytes + unsigned TransferBytes = 0; + unsigned NumVecs = 0; + unsigned Opc = Inst.getOpcode(); + switch (Opc) { + case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: + case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: + case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: + case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: { + switch (Opc) { + case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: + TransferBytes = 1; break; + case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: + TransferBytes = 2; break; + case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: + TransferBytes = 4; break; + case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: + TransferBytes = 8; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 1; + break; + } + + case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: + case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: + case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: + case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: { + switch (Opc) { + case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: + TransferBytes = 1; break; + case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: + TransferBytes = 2; break; + case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: + TransferBytes = 4; break; + case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: + TransferBytes = 8; break; + } + IsLoadDup = true; + NumVecs = 1; + break; + } + + case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: + case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: + case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: + case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: { + switch (Opc) { + case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: + TransferBytes = 2; break; + case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: + TransferBytes = 4; break; + case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: + TransferBytes = 8; break; + case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: + TransferBytes = 16; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 2; + break; + } + + case AArch64::LD2R_WB_16B_fixed: 
case AArch64::LD2R_WB_16B_register: + case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: + case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: + case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: { + switch (Opc) { + case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: + TransferBytes = 2; break; + case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: + TransferBytes = 4; break; + case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: + TransferBytes = 8; break; + case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: + TransferBytes = 16; break; + } + IsLoadDup = true; + NumVecs = 2; + break; + } + + case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: + case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: + case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: + case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: { + switch (Opc) { + case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: + TransferBytes = 3; break; + case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: + TransferBytes = 6; break; + case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: + TransferBytes = 12; break; + case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: + TransferBytes = 24; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 3; + break; + } + + case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register: + case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register: + case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register: + case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: { + switch (Opc) { + case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register: + TransferBytes = 3; break; + case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register: + TransferBytes = 6; break; + case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register: + TransferBytes = 12; break; + case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: + TransferBytes = 24; break; + } + IsLoadDup = true; + NumVecs = 3; + break; + } + + case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: + case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: + case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: + case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: { + switch (Opc) { + case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: + TransferBytes = 4; break; + case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: + TransferBytes = 8; break; + case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: + TransferBytes = 16; break; + case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: + TransferBytes = 32; break; + } + Is64bitVec = true; + IsLoadDup = true; + NumVecs = 4; + break; + } + + case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: + case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register: + case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register: + case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: { + switch (Opc) { + case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: + TransferBytes = 4; break; + case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register: + TransferBytes = 8; break; + case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register: + TransferBytes = 16; 
break; + case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: + TransferBytes = 32; break; + } + IsLoadDup = true; + NumVecs = 4; + break; + } + + case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: + case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: + case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: + case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: { + switch (Opc) { + case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: + TransferBytes = 1; break; + case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: + TransferBytes = 2; break; + case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: + TransferBytes = 4; break; + case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: + TransferBytes = 8; break; + } + IsLoad = true; + NumVecs = 1; + break; + } + + case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: + case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: + case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: + case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: { + switch (Opc) { + case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: + TransferBytes = 2; break; + case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: + TransferBytes = 4; break; + case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: + TransferBytes = 8; break; + case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: + TransferBytes = 16; break; + } + IsLoad = true; + NumVecs = 2; + break; + } + + case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: + case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: + case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: + case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: { + switch (Opc) { + case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: + TransferBytes = 3; break; + case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: + TransferBytes = 6; break; + case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: + TransferBytes = 12; break; + case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: + TransferBytes = 24; break; + } + IsLoad = true; + NumVecs = 3; + break; + } + + case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: + case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: + case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: + case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: { + switch (Opc) { + case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: + TransferBytes = 4; break; + case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: + TransferBytes = 8; break; + case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: + TransferBytes = 16; break; + case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: + TransferBytes = 32; break; + } + IsLoad = true; + NumVecs = 4; + break; + } + + case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: + case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: + case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: + case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: { + switch (Opc) { + case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: + TransferBytes = 1; break; + case AArch64::ST1LN_WB_H_fixed: case 
AArch64::ST1LN_WB_H_register: + TransferBytes = 2; break; + case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: + TransferBytes = 4; break; + case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: + TransferBytes = 8; break; + } + NumVecs = 1; + break; + } + + case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: + case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: + case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: + case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: { + switch (Opc) { + case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: + TransferBytes = 2; break; + case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: + TransferBytes = 4; break; + case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: + TransferBytes = 8; break; + case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: + TransferBytes = 16; break; + } + NumVecs = 2; + break; + } + + case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: + case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: + case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: + case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: { + switch (Opc) { + case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: + TransferBytes = 3; break; + case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: + TransferBytes = 6; break; + case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: + TransferBytes = 12; break; + case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: + TransferBytes = 24; break; + } + NumVecs = 3; + break; + } + + case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: + case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: + case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: + case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: { + switch (Opc) { + case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: + TransferBytes = 4; break; + case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: + TransferBytes = 8; break; + case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: + TransferBytes = 16; break; + case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: + TransferBytes = 32; break; + } + NumVecs = 4; + break; + } + + default: + return MCDisassembler::Fail; + } // End of switch (Opc) + + unsigned Rt = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned Rm = fieldFromInstruction(Insn, 16, 5); + + // Decode post-index of load duplicate lane + if (IsLoadDup) { + switch (NumVecs) { + case 1: + Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder) + : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder) + : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder) + : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder) + : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); + } + + // Decode write back register, which is equal to Rn. 
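For the duplicating loads handled above (and the per-lane forms that follow), the long opcode switch reduces to one relationship: the post-increment amount is the number of vectors in the list times the element size in bytes. A small illustrative sketch of that arithmetic, with transferBytes as a made-up helper name rather than anything defined in this patch:

    #include <cassert>

    // TransferBytes = NumVecs * ElemBytes; e.g. LD3R.4H moves 3 * 2 = 6 bytes,
    // LD4R.2S moves 4 * 4 = 16 bytes, ST2LN.D moves 2 * 8 = 16 bytes.
    static unsigned transferBytes(unsigned NumVecs, unsigned ElemBytes) {
      assert(NumVecs >= 1 && NumVecs <= 4 && "vector lists have 1 to 4 registers");
      return NumVecs * ElemBytes;
    }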
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + if (Rm == 31) // If Rm is 0b11111, add the number of transferred bytes + Inst.addOperand(MCOperand::CreateImm(TransferBytes)); + else // Decode Rm + DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + + return MCDisassembler::Success; + } + + // Decode post-index of load/store lane + // Loads have a vector list as output. + if (IsLoad) { + switch (NumVecs) { + case 1: + DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); + } + } + + // Decode write back register, which is equal to Rn. + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + if (Rm == 31) // If Rm is 0b11111, add the number of transferred bytes + Inst.addOperand(MCOperand::CreateImm(TransferBytes)); + else // Decode Rm + DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + + // Decode the source vector list. + switch (NumVecs) { + case 1: + DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + break; + case 4: + DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); + } + + // Decode lane + unsigned Q = fieldFromInstruction(Insn, 30, 1); + unsigned S = fieldFromInstruction(Insn, 10, 3); + unsigned lane = 0; + // Calculate the number of lanes by number of vectors and transferred bytes. + // NumLanes = 16 bytes / bytes of each lane + unsigned NumLanes = 16 / (TransferBytes / NumVecs); + switch (NumLanes) { + case 16: // A vector has 16 lanes, each lane is 1 byte.
+ lane = (Q << 3) | S; + break; + case 8: + lane = (Q << 2) | (S >> 1); + break; + case 4: + lane = (Q << 1) | (S >> 2); + break; + case 2: + lane = Q; + break; + } + Inst.addOperand(MCOperand::CreateImm(lane)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned size = fieldFromInstruction(Insn, 22, 2); + unsigned Q = fieldFromInstruction(Insn, 30, 1); + + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); + + if (Q) + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + else + DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder); + + switch (size) { + case 0: + Inst.addOperand(MCOperand::CreateImm(8)); + break; + case 1: + Inst.addOperand(MCOperand::CreateImm(16)); + break; + case 2: + Inst.addOperand(MCOperand::CreateImm(32)); + break; + default: + return MCDisassembler::Fail; + } + return MCDisassembler::Success; +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 82ce80c8b1a1..0438de3152e1 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -368,6 +368,14 @@ AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, O << "#" << (Imm * MemScale); } +void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNo).getReg(); + std::string Name = getRegisterName(Reg); + Name[0] = 'v'; + O << Name; +} + void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -406,3 +414,126 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, printAnnotation(O, Annot); } + +template <A64SE::ShiftExtSpecifiers Ext, bool isHalf> +void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + + assert(MO.isImm() && + "Immediate operand required for Neon vector immediate inst."); + + bool IsLSL = false; + if (Ext == A64SE::LSL) + IsLSL = true; + else if (Ext != A64SE::MSL) + llvm_unreachable("Invalid shift specifier in movi instruction"); + + int64_t Imm = MO.getImm(); + + // MSL and LSLH accept encoded shift amount 0 or 1. + if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1) + llvm_unreachable("Invalid shift amount in movi instruction"); + + // LSL accepts encoded shift amount 0, 1, 2 or 3. + if (IsLSL && (Imm < 0 || Imm > 3)) + llvm_unreachable("Invalid shift amount in movi instruction"); + + // Print shift amount as multiple of 8 with MSL encoded shift amount + // 0 and 1 printed as 8 and 16.
+ if (!IsLSL) + Imm++; + Imm *= 8; + + // LSL #0 is not printed + if (IsLSL) { + if (Imm == 0) + return; + O << ", lsl"; + } else + O << ", msl"; + + O << " #" << Imm; +} + +void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &o) { + o << "#0x0"; +} + +void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm = MI->getOperand(OpNum); + + assert(MOUImm.isImm() && + "Immediate operand required for Neon vector immediate inst."); + + unsigned Imm = MOUImm.getImm(); + + O << "#0x"; + O.write_hex(Imm); +} + +void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm = MI->getOperand(OpNum); + + assert(MOUImm.isImm() + && "Immediate operand required for Neon vector immediate inst."); + + unsigned Imm = MOUImm.getImm(); + O << Imm; +} + +void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOUImm8 = MI->getOperand(OpNum); + + assert(MOUImm8.isImm() && + "Immediate operand required for Neon vector immediate bytemask inst."); + + uint32_t UImm8 = MOUImm8.getImm(); + uint64_t Mask = 0; + + // Replicates 0x00 or 0xff byte in a 64-bit vector + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((UImm8 >> ByteNum) & 1) + Mask |= (uint64_t)0xff << (8 * ByteNum); + } + + O << "#0x"; + O.write_hex(Mask); +} + +// If Count > 1, there are two valid kinds of vector list: +// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} +// (2) {Vn.layout - Vm.layout} +// We choose the first kind as output. +template +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors"); + + unsigned Reg = MI->getOperand(OpNum).getReg(); + std::string LayoutStr = A64VectorLayoutToString(Layout); + O << "{"; + if (Count > 1) { // Print sub registers separately + bool IsVec64 = (Layout < A64Layout::VL_16B); + unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0; + for (unsigned I = 0; I < Count; I++) { + std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++)); + Name[0] = 'v'; + O << Name << LayoutStr; + if (I != Count - 1) + O << ", "; + } + } else { // Print the register directly when NumVecs is 1. 
+ std::string Name = getRegisterName(Reg); + Name[0] = 'v'; + O << Name << LayoutStr; + } + O << "}"; +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 639fa869c016..37b7273438dd 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -157,6 +157,7 @@ public: void printRegExtendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O, A64SE::ShiftExtSpecifiers Ext); + void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); @@ -164,9 +165,18 @@ public: return RegNo == AArch64::XSP || RegNo == AArch64::WSP; } - + template + void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + template + void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); }; - } #endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a3373b1087bb..8a9077c1cab4 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -578,8 +578,8 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { } MCAsmBackend * -llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { +llvm::createAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { Triple TheTriple(TT); - return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS()); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 3b811df212d1..a64c463f9e5c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -55,11 +55,10 @@ namespace { /// by MachO. Beware! class AArch64ELFStreamer : public MCELFStreamer { public: - AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), - MappingSymbolCounter(0), LastEMS(EMS_None) { - } + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, 0, TAB, OS, Emitter), MappingSymbolCounter(0), + LastEMS(EMS_None) {} ~AArch64ELFStreamer() {} @@ -85,18 +84,17 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - virtual void EmitBytes(StringRef Data, unsigned AddrSpace) { + virtual void EmitBytes(StringRef Data) { EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data, AddrSpace); + MCELFStreamer::EmitBytes(Data); } /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. 
- virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - unsigned AddrSpace) { + virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace); + MCELFStreamer::EmitValueImpl(Value, Size); } private: @@ -130,7 +128,7 @@ private: MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); + AssignSection(Symbol, getCurrentSection().first); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 8ec8cbf1c525..add874c12019 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -31,11 +31,12 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() { UseDataRegionDirectives = true; - WeakRefDirective = "\t.weak\t"; - HasLEB128 = true; SupportsDebugInformation = true; // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; } + +// Pin the vtable to this file. +void AArch64ELFMCAsmInfo::anchor() {} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index a20bc471c20d..d1dd285c832c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -14,13 +14,15 @@ #ifndef LLVM_AARCH64TARGETASMINFO_H #define LLVM_AARCH64TARGETASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - struct AArch64ELFMCAsmInfo : public MCAsmInfo { - explicit AArch64ELFMCAsmInfo(); - }; +struct AArch64ELFMCAsmInfo : public MCAsmInfoELF { + explicit AArch64ELFMCAsmInfo(); +private: + virtual void anchor(); +}; } // namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index a5c591eee800..b41c566f612b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -59,6 +59,23 @@ public: unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups) const; + unsigned getShiftRightImm8(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + unsigned getShiftRightImm16(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + unsigned getShiftRightImm32(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + unsigned getShiftRightImm64(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + + unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; + unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op, + SmallVectorImpl &Fixups) const; // Labels are handled mostly the same way: a symbol is needed, and // just gets some fixup attached. 
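The getShiftRightImm8/16/32/64 hooks declared above are the encoding-side counterparts of the DecodeShiftRightImm8/16/32/64 functions added to the disassembler earlier in this patch: a right-shift amount S for an N-bit element is stored in the instruction as N - S, so encoding and decoding are mirror images. A standalone sketch of that round trip (illustration only; encodeShr/decodeShr are invented names, not LLVM API):

    #include <cassert>

    // For an N-bit element, a shift-right amount S in [1, N] is stored as N - S.
    static unsigned encodeShr(unsigned N, unsigned S) { return N - S; }
    static unsigned decodeShr(unsigned N, unsigned Field) { return N - Field; }

    int main() {
      for (unsigned S = 1; S <= 8; ++S)
        assert(decodeShr(8, encodeShr(8, S)) == S); // round-trips for .8B/.16B
      return 0;
    }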
@@ -152,10 +169,10 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, switch (Expr->getKind()) { default: llvm_unreachable("Unexpected operand modifier"); case AArch64MCExpr::VK_AARCH64_LO12: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, - AArch64::fixup_a64_ldst16_lo12, - AArch64::fixup_a64_ldst32_lo12, - AArch64::fixup_a64_ldst64_lo12, + static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, + AArch64::fixup_a64_ldst16_lo12, + AArch64::fixup_a64_ldst32_lo12, + AArch64::fixup_a64_ldst64_lo12, AArch64::fixup_a64_ldst128_lo12 }; assert(MemSize <= 16 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; @@ -166,19 +183,23 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc; break; case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12, - AArch64::fixup_a64_ldst16_dtprel_lo12, - AArch64::fixup_a64_ldst32_dtprel_lo12, - AArch64::fixup_a64_ldst64_dtprel_lo12 }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_dtprel_lo12, + AArch64::fixup_a64_ldst16_dtprel_lo12, + AArch64::fixup_a64_ldst32_dtprel_lo12, + AArch64::fixup_a64_ldst64_dtprel_lo12 + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; } case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12_nc, - AArch64::fixup_a64_ldst16_dtprel_lo12_nc, - AArch64::fixup_a64_ldst32_dtprel_lo12_nc, - AArch64::fixup_a64_ldst64_dtprel_lo12_nc }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_dtprel_lo12_nc, + AArch64::fixup_a64_ldst16_dtprel_lo12_nc, + AArch64::fixup_a64_ldst32_dtprel_lo12_nc, + AArch64::fixup_a64_ldst64_dtprel_lo12_nc + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; @@ -188,19 +209,23 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc; break; case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{ - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12, - AArch64::fixup_a64_ldst16_tprel_lo12, - AArch64::fixup_a64_ldst32_tprel_lo12, - AArch64::fixup_a64_ldst64_tprel_lo12 }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_tprel_lo12, + AArch64::fixup_a64_ldst16_tprel_lo12, + AArch64::fixup_a64_ldst32_tprel_lo12, + AArch64::fixup_a64_ldst64_tprel_lo12 + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; } case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: { - unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12_nc, - AArch64::fixup_a64_ldst16_tprel_lo12_nc, - AArch64::fixup_a64_ldst32_tprel_lo12_nc, - AArch64::fixup_a64_ldst64_tprel_lo12_nc }; + static const unsigned FixupsBySize[] = { + AArch64::fixup_a64_ldst8_tprel_lo12_nc, + AArch64::fixup_a64_ldst16_tprel_lo12_nc, + AArch64::fixup_a64_ldst32_tprel_lo12_nc, + AArch64::fixup_a64_ldst64_tprel_lo12_nc + }; assert(MemSize <= 8 && "Invalid fixup for operation"); FixupKind = FixupsBySize[Log2_32(MemSize)]; break; @@ -302,6 +327,45 @@ AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6; } +unsigned AArch64MCCodeEmitter::getShiftRightImm8( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return 8 - MI.getOperand(Op).getImm(); +} + +unsigned 
AArch64MCCodeEmitter::getShiftRightImm16( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return 16 - MI.getOperand(Op).getImm(); +} + +unsigned AArch64MCCodeEmitter::getShiftRightImm32( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return 32 - MI.getOperand(Op).getImm(); +} + +unsigned AArch64MCCodeEmitter::getShiftRightImm64( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm8( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return MI.getOperand(Op).getImm() - 8; +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm16( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return MI.getOperand(Op).getImm() - 16; +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm32( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return MI.getOperand(Op).getImm() - 32; +} + +unsigned AArch64MCCodeEmitter::getShiftLeftImm64( + const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups) const { + return MI.getOperand(Op).getImm() - 64; +} template unsigned AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI, @@ -346,7 +410,7 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups) const { if (MO.isReg()) { - return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()); + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); } else if (MO.isImm()) { return static_cast(MO.getImm()); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 819eeadb44e4..58fc95c2eaf6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -40,7 +40,7 @@ MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *X = new MCSubtargetInfo(); - InitAArch64MCSubtargetInfo(X, TT, CPU, ""); + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); return X; } @@ -57,13 +57,14 @@ static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { return X; } -static MCAsmInfo *createAArch64MCAsmInfo(const Target &T, StringRef TT) { +static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, + StringRef TT) { Triple TheTriple(TT); MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(); - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(AArch64::XSP, 0); - MAI->addInitialFrameState(0, Dst, Src); + unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0); + MAI->addInitialFrameState(Inst); return MAI; } @@ -135,17 +136,17 @@ public: return MCInstrAnalysis::isConditionalBranch(Inst); } - uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr, - uint64_t Size) const { + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size, uint64_t &Target) const { unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0; // FIXME: We only handle PCRel branches for now. 
if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType != MCOI::OPERAND_PCREL) - return -1ULL; + return false; int64_t Imm = Inst.getOperand(LblOperand).getImm(); - - return Addr + Imm; + Target = Addr + Imm; + return true; } }; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 3849fe379513..670e657ec73c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -43,8 +43,9 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI); -MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT, - StringRef CPU); +MCAsmBackend *createAArch64AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU); } // End llvm namespace diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index fc706a4cd42d..377b533be898 100644 --- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -20,5 +20,5 @@ Target llvm::TheAArch64Target; extern "C" void LLVMInitializeAArch64TargetInfo() { RegisterTarget - X(TheAArch64Target, "aarch64", "AArch64"); + X(TheAArch64Target, "aarch64", "AArch64 (ARM 64-bit target)"); } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index bedccb5438f6..2a97cd632560 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -972,7 +972,7 @@ bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) { // Now we have to work out the amount of rotation needed. The first part of // this calculation is actually independent of RepeatWidth, but the complex // case will depend on it. - Rotation = CountTrailingZeros_64(Imm); + Rotation = countTrailingZeros(Imm); if (Rotation == 0) { // There were no leading zeros, which means it's either in place or there // are 1s at each end (e.g. 0x8003 needs rotating). @@ -1105,3 +1105,69 @@ bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value, return isMOVNImm(RegWidth, Value, UImm16, Shift); } + +// decodeNeonModShiftImm - Decode a Neon OpCmode value into the +// the shift amount and the shift type (shift zeros or ones in) and +// returns whether the OpCmode value implies a shift operation. +bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, + unsigned &ShiftOnesIn) { + ShiftImm = 0; + ShiftOnesIn = false; + bool HasShift = true; + + if (OpCmode == 0xe) { + // movi byte + HasShift = false; + } else if (OpCmode == 0x1e) { + // movi 64-bit bytemask + HasShift = false; + } else if ((OpCmode & 0xc) == 0x8) { + // shift zeros, per halfword + ShiftImm = ((OpCmode & 0x2) >> 1); + } else if ((OpCmode & 0x8) == 0) { + // shift zeros, per word + ShiftImm = ((OpCmode & 0x6) >> 1); + } else if ((OpCmode & 0xe) == 0xc) { + // shift ones, per word + ShiftOnesIn = true; + ShiftImm = (OpCmode & 0x1); + } else { + // per byte, per bytemask + llvm_unreachable("Unsupported Neon modified immediate"); + } + + return HasShift; +} + +// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values +// into the element value and the element size in bits. 
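A concrete instance of the OpCmode handling in decodeNeonModShiftImm above and decodeNeonModImm below: OpCmode 0x1e selects the 64-bit bytemask form, where each of the eight bits of Val chooses an all-zeros or all-ones byte. A small worked sketch of just that expansion, mirroring the loop used in the patch (expandByteMask is an invented name):

    #include <cassert>
    #include <cstdint>

    // Expand an 8-bit bytemask: bit i set -> byte i of the result is 0xff.
    static uint64_t expandByteMask(unsigned Val) {
      uint64_t Mask = 0;
      for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum)
        if ((Val >> ByteNum) & 1)
          Mask |= (uint64_t)0xff << (8 * ByteNum);
      return Mask;
    }

    int main() {
      assert(expandByteMask(0x0f) == UINT64_C(0x00000000ffffffff));
      assert(expandByteMask(0x81) == UINT64_C(0xff000000000000ff));
      return 0;
    }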
+uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode, + unsigned &EltBits) { + uint64_t DecodedVal = Val; + EltBits = 0; + + if (OpCmode == 0xe) { + // movi byte + EltBits = 8; + } else if (OpCmode == 0x1e) { + // movi 64-bit bytemask + DecodedVal = 0; + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((Val >> ByteNum) & 1) + DecodedVal |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else if ((OpCmode & 0xc) == 0x8) { + // shift zeros, per halfword + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // shift zeros, per word + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // shift ones, per word + EltBits = 32; + } else { + llvm_unreachable("Unsupported Neon modified immediate"); + } + return DecodedVal; +} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 9a1ca6127ae9..ce970b0a8aba 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -289,6 +289,7 @@ namespace A64SE { enum ShiftExtSpecifiers { Invalid = -1, LSL, + MSL, LSR, ASR, ROR, @@ -305,6 +306,65 @@ namespace A64SE { }; } +namespace A64Layout { + enum VectorLayout { + Invalid = -1, + VL_8B, + VL_4H, + VL_2S, + VL_1D, + + VL_16B, + VL_8H, + VL_4S, + VL_2D, + + // Bare layout for the 128-bit vector + // (only show ".b", ".h", ".s", ".d" without vector number) + VL_B, + VL_H, + VL_S, + VL_D + }; +} + +inline static const char * +A64VectorLayoutToString(A64Layout::VectorLayout Layout) { + switch (Layout) { + case A64Layout::VL_8B: return ".8b"; + case A64Layout::VL_4H: return ".4h"; + case A64Layout::VL_2S: return ".2s"; + case A64Layout::VL_1D: return ".1d"; + case A64Layout::VL_16B: return ".16b"; + case A64Layout::VL_8H: return ".8h"; + case A64Layout::VL_4S: return ".4s"; + case A64Layout::VL_2D: return ".2d"; + case A64Layout::VL_B: return ".b"; + case A64Layout::VL_H: return ".h"; + case A64Layout::VL_S: return ".s"; + case A64Layout::VL_D: return ".d"; + default: llvm_unreachable("Unknown Vector Layout"); + } +} + +inline static A64Layout::VectorLayout +A64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch(LayoutStr) + .Case(".8b", A64Layout::VL_8B) + .Case(".4h", A64Layout::VL_4H) + .Case(".2s", A64Layout::VL_2S) + .Case(".1d", A64Layout::VL_1D) + .Case(".16b", A64Layout::VL_16B) + .Case(".8h", A64Layout::VL_8H) + .Case(".4s", A64Layout::VL_4S) + .Case(".2d", A64Layout::VL_2D) + .Case(".b", A64Layout::VL_B) + .Case(".h", A64Layout::VL_H) + .Case(".s", A64Layout::VL_S) + .Case(".d", A64Layout::VL_D) + .Default(A64Layout::Invalid); +} + namespace A64SysReg { enum SysRegROValues { MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 @@ -1068,7 +1128,10 @@ namespace A64Imms { // MOVN but *not* with a MOVZ (because that would take priority). bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); -} + uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits); + bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, + unsigned &ShiftOnesIn); + } } // end namespace llvm; diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt index 2c28348d7d81..2348e44f850b 100644 --- a/lib/Target/AArch64/Utils/CMakeLists.txt +++ b/lib/Target/AArch64/Utils/CMakeLists.txt @@ -3,3 +3,5 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
${CMAKE_CURRENT_SOURCE_DIR}/ add_llvm_library(LLVMAArch64Utils AArch64BaseInfo.cpp ) + +add_dependencies(LLVMAArch64Utils AArch64CommonTableGen) diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index f0d4dbe2bfb3..ff585b41a2aa 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -615,7 +615,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { SmallVector Defs = getReadDPRs(MI); bool Modified = false; - for (SmallVector::iterator I = Defs.begin(), E = Defs.end(); + for (SmallVectorImpl::iterator I = Defs.begin(), E = Defs.end(); I != E; ++I) { // Follow the def-use chain for this DPR through COPYs, and also through // PHIs (which are essentially multi-way COPYs). It is because of PHIs that @@ -630,7 +630,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { elideCopiesAndPHIs(Def, DefSrcs); - for (SmallVector::iterator II = DefSrcs.begin(), + for (SmallVectorImpl::iterator II = DefSrcs.begin(), EE = DefSrcs.end(); II != EE; ++II) { MachineInstr *MI = *II; @@ -655,8 +655,15 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { if (NewReg != 0) { Modified = true; - for (SmallVector::const_iterator I = Uses.begin(), + for (SmallVectorImpl::const_iterator I = Uses.begin(), E = Uses.end(); I != E; ++I) { + // Make sure to constrain the register class of the new register to + // match what we're replacing. Otherwise we can optimize a DPR_VFP2 + // reference into a plain DPR, and that will end poorly. NewReg is + // always virtual here, so there will always be a matching subclass + // to find. + MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); + DEBUG(dbgs() << "Replacing operand " << **I << " with " << PrintReg(NewReg) << "\n"); diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 2d7470919dc4..36e5680ca4e0 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -38,12 +38,16 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", "Enable Thumb2 instructions">; def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", - "Does not support ARM mode execution">; + "Does not support ARM mode execution", + [ModeThumb]>; def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", "Enable VFP4 instructions", [FeatureVFP3, FeatureFP16]>; +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", + "true", "Enable ARMv8 FP", + [FeatureVFP4]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", "Restrict VFP3 to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", @@ -59,8 +63,15 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", "FP compare + branch is slow">; def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", "Floating point unit supports single precision only">; +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable support for Performance Monitor extensions">; def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", "Enable support for TrustZone security extensions">; +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable support for Cryptography extensions", + [FeatureNEON]>; +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable support for CRC instructions">; // Some processors have FP 
multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better @@ -108,10 +119,24 @@ def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true", def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", "Supports Multiprocessing extension">; -// M-series ISA? -def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true", +// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8). +def FeatureVirtualization : SubtargetFeature<"virtualization", + "HasVirtualization", "true", + "Supports Virtualization extension", + [FeatureHWDiv, FeatureHWDivARM]>; + +// M-series ISA +def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", "Is microcontroller profile ('M' series)">; +// R-series ISA +def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", + "Is realtime profile ('R' series)">; + +// A-series ISA +def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", + "Is application profile ('A' series)">; + // Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. // See ARMInstrInfo.td for details. def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", @@ -129,12 +154,19 @@ def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", "Support ARM v6 instructions", [HasV5TEOps]>; +def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", + "Support ARM v6M instructions", + [HasV6Ops]>; def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", - [HasV6Ops, FeatureThumb2]>; + [HasV6MOps, FeatureThumb2]>; def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", - [HasV6T2Ops]>; + [HasV6T2Ops, FeaturePerfMon]>; +def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", + "Support ARM v8 instructions", + [HasV7Ops, FeatureVirtualization, + FeatureMP]>; //===----------------------------------------------------------------------===// // ARM Processors supported. @@ -170,12 +202,27 @@ def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", // FIXME: It has not been determined if A15 has these features. def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", "Cortex-A15 ARM processors", - [FeatureT2XtPk, FeatureFP16, + [FeatureT2XtPk, FeatureVFP4, + FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureAvoidPartialCPSR, - FeatureTrustZone]>; + FeatureTrustZone, FeatureVirtualization]>; + +def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", + "Cortex-A53 ARM processors", + [FeatureHWDiv, FeatureHWDivARM, + FeatureTrustZone, FeatureT2XtPk, + FeatureCrypto, FeatureCRC]>; + +def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", + [FeatureHWDiv, FeatureHWDivARM, + FeatureTrustZone, FeatureT2XtPk, + FeatureCrypto, FeatureCRC]>; + def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", "Cortex-R5 ARM processors", - [FeatureSlowFPBrcc, FeatureHWDivARM, + [FeatureSlowFPBrcc, + FeatureHWDiv, FeatureHWDivARM, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR, FeatureT2XtPk]>; @@ -233,7 +280,7 @@ def : Processor<"mpcore", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, FeatureHasSlowFPVMLx]>; // V6M Processors. -def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6Ops, FeatureNoARM, +def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, FeatureDB, FeatureMClass]>; // V6T2 Processors. 
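The subtarget features above rely heavily on implication lists: FeatureFPARMv8 pulls in FeatureVFP4, which pulls in FeatureVFP3 and FeatureFP16, and HasV8Ops chains down through HasV7Ops and HasV6T2Ops. Conceptually this is a transitive closure over the implied-feature lists; the toy sketch below only illustrates the idea and is not TableGen's or the ARM backend's actual machinery:

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    typedef std::map<std::string, std::vector<std::string> > ImpliesMap;

    // Collect a feature and everything it transitively implies.
    static void collectImplied(const std::string &Feat, const ImpliesMap &Implies,
                               std::set<std::string> &Out) {
      if (!Out.insert(Feat).second)
        return; // already visited
      ImpliesMap::const_iterator I = Implies.find(Feat);
      if (I == Implies.end())
        return;
      for (size_t N = 0; N != I->second.size(); ++N)
        collectImplied(I->second[N], Implies, Out);
    }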
@@ -248,26 +295,30 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, def : ProcessorModel<"cortex-a5", CortexA8Model, [ProcA5, HasV7Ops, FeatureNEON, FeatureDB, FeatureVFP4, FeatureDSPThumb2, - FeatureHasRAS]>; + FeatureHasRAS, FeatureAClass]>; def : ProcessorModel<"cortex-a8", CortexA8Model, [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS]>; + FeatureDSPThumb2, FeatureHasRAS, + FeatureAClass]>; def : ProcessorModel<"cortex-a9", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS]>; + FeatureDSPThumb2, FeatureHasRAS, + FeatureAClass]>; def : ProcessorModel<"cortex-a9-mp", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureMP, - FeatureHasRAS]>; + FeatureHasRAS, FeatureAClass]>; // FIXME: A15 has currently the same ProcessorModel as A9. def : ProcessorModel<"cortex-a15", CortexA9Model, [ProcA15, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS]>; + FeatureDSPThumb2, FeatureHasRAS, + FeatureAClass]>; // FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ProcR5, HasV7Ops, FeatureDB, FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS]>; + FeatureHasRAS, FeatureVFPOnlySP, + FeatureD16, FeatureRClass]>; // V7M Processors. def : ProcNoItin<"cortex-m3", [HasV7Ops, @@ -279,13 +330,22 @@ def : ProcNoItin<"cortex-m4", [HasV7Ops, FeatureThumb2, FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, FeatureT2XtPk, FeatureVFP4, - FeatureVFPOnlySP, FeatureMClass]>; + FeatureVFPOnlySP, FeatureD16, + FeatureMClass]>; // Swift uArch Processors. def : ProcessorModel<"swift", SwiftModel, [ProcSwift, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, - FeatureHasRAS]>; + FeatureHasRAS, FeatureAClass]>; + +// V8 Processors +def : ProcNoItin<"cortex-a53", [ProcA53, HasV8Ops, FeatureAClass, + FeatureDB, FeatureFPARMv8, + FeatureNEON, FeatureDSPThumb2]>; +def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass, + FeatureDB, FeatureFPARMv8, + FeatureNEON, FeatureDSPThumb2]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 13ec2087938a..e79f88d4b6f1 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -17,6 +17,7 @@ #include "ARM.h" #include "ARMBuildAttrs.h" #include "ARMConstantPoolValue.h" +#include "ARMFPUName.h" #include "ARMMachineFunctionInfo.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" @@ -55,235 +56,67 @@ #include using namespace llvm; -namespace { - - // Per section and per symbol attributes are not supported. - // To implement them we would need the ability to delay this emission - // until the assembly file is fully parsed/generated as only then do we - // know the symbol and section numbers. 
- class AttributeEmitter { - public: - virtual void MaybeSwitchVendor(StringRef Vendor) = 0; - virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0; - virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0; - virtual void Finish() = 0; - virtual ~AttributeEmitter() {} - }; - - class AsmAttributeEmitter : public AttributeEmitter { - MCStreamer &Streamer; - - public: - AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {} - void MaybeSwitchVendor(StringRef Vendor) { } - - void EmitAttribute(unsigned Attribute, unsigned Value) { - Streamer.EmitRawText("\t.eabi_attribute " + - Twine(Attribute) + ", " + Twine(Value)); - } - - void EmitTextAttribute(unsigned Attribute, StringRef String) { - switch (Attribute) { - default: llvm_unreachable("Unsupported Text attribute in ASM Mode"); - case ARMBuildAttrs::CPU_name: - Streamer.EmitRawText(StringRef("\t.cpu ") + String.lower()); - break; - /* GAS requires .fpu to be emitted regardless of EABI attribute */ - case ARMBuildAttrs::Advanced_SIMD_arch: - case ARMBuildAttrs::VFP_arch: - Streamer.EmitRawText(StringRef("\t.fpu ") + String.lower()); - break; - } - } - void Finish() { } - }; - - class ObjectAttributeEmitter : public AttributeEmitter { - // This structure holds all attributes, accounting for - // their string/numeric value, so we can later emmit them - // in declaration order, keeping all in the same vector - struct AttributeItemType { - enum { - HiddenAttribute = 0, - NumericAttribute, - TextAttribute - } Type; - unsigned Tag; - unsigned IntValue; - StringRef StringValue; - } AttributeItem; - - MCObjectStreamer &Streamer; - StringRef CurrentVendor; - SmallVector Contents; - - // Account for the ULEB/String size of each item, - // not just the number of items - size_t ContentsSize; - // FIXME: this should be in a more generic place, but - // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf - size_t getULEBSize(int Value) { - size_t Size = 0; - do { - Value >>= 7; - Size += sizeof(int8_t); // Is this really necessary? 
- } while (Value); - return Size; - } - - public: - ObjectAttributeEmitter(MCObjectStreamer &Streamer_) : - Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { } - - void MaybeSwitchVendor(StringRef Vendor) { - assert(!Vendor.empty() && "Vendor cannot be empty."); - - if (CurrentVendor.empty()) - CurrentVendor = Vendor; - else if (CurrentVendor == Vendor) - return; - else - Finish(); - - CurrentVendor = Vendor; - - assert(Contents.size() == 0); - } - - void EmitAttribute(unsigned Attribute, unsigned Value) { - AttributeItemType attr = { - AttributeItemType::NumericAttribute, - Attribute, - Value, - StringRef("") - }; - ContentsSize += getULEBSize(Attribute); - ContentsSize += getULEBSize(Value); - Contents.push_back(attr); - } - - void EmitTextAttribute(unsigned Attribute, StringRef String) { - AttributeItemType attr = { - AttributeItemType::TextAttribute, - Attribute, - 0, - String - }; - ContentsSize += getULEBSize(Attribute); - // String + \0 - ContentsSize += String.size()+1; - - Contents.push_back(attr); - } - - void Finish() { - // Vendor size + Vendor name + '\0' - const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1; - - // Tag + Tag Size - const size_t TagHeaderSize = 1 + 4; - - Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4); - Streamer.EmitBytes(CurrentVendor); - Streamer.EmitIntValue(0, 1); // '\0' - - Streamer.EmitIntValue(ARMBuildAttrs::File, 1); - Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4); - - // Size should have been accounted for already, now - // emit each field as its type (ULEB or String) - for (unsigned int i=0; igetNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); - } - return Location; -} - /// EmitDwarfRegOp - Emit dwarf register operation. -void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { +void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc, + bool Indirect) const { const TargetRegisterInfo *RI = TM.getRegisterInfo(); - if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) - AsmPrinter::EmitDwarfRegOp(MLoc); - else { - unsigned Reg = MLoc.getReg(); - if (Reg >= ARM::S0 && Reg <= ARM::S31) { - assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); - // S registers are described as bit-pieces of a register - // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) - // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) - - unsigned SReg = Reg - ARM::S0; - bool odd = SReg & 0x1; - unsigned Rx = 256 + (SReg >> 1); - - OutStreamer.AddComment("DW_OP_regx for S register"); - EmitInt8(dwarf::DW_OP_regx); - - OutStreamer.AddComment(Twine(SReg)); - EmitULEB128(Rx); - - if (odd) { - OutStreamer.AddComment("DW_OP_bit_piece 32 32"); - EmitInt8(dwarf::DW_OP_bit_piece); - EmitULEB128(32); - EmitULEB128(32); - } else { - OutStreamer.AddComment("DW_OP_bit_piece 32 0"); - EmitInt8(dwarf::DW_OP_bit_piece); - EmitULEB128(32); - EmitULEB128(0); - } - } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { - assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); - // Q registers Q0-Q15 are described by composing two D registers together. 
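The removed ObjectAttributeEmitter sized each attribute with a small ULEB128 length helper before emitting the vendor subsection. A self-contained sketch of that sizing rule (one byte per 7 bits of payload), kept independent of any LLVM types:

#include <cassert>
#include <cstddef>
#include <cstdint>

static size_t getULEB128Size(uint64_t Value) {
  size_t Size = 0;
  do {
    Value >>= 7;
    ++Size;            // one byte per 7 bits; zero still takes one byte
  } while (Value);
  return Size;
}

int main() {
  assert(getULEB128Size(0) == 1);
  assert(getULEB128Size(127) == 1);
  assert(getULEB128Size(128) == 2);
  assert(getULEB128Size(16384) == 3);
}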
- // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) - // DW_OP_piece(8) - - unsigned QReg = Reg - ARM::Q0; - unsigned D1 = 256 + 2 * QReg; - unsigned D2 = D1 + 1; - - OutStreamer.AddComment("DW_OP_regx for Q register: D1"); - EmitInt8(dwarf::DW_OP_regx); - EmitULEB128(D1); - OutStreamer.AddComment("DW_OP_piece 8"); - EmitInt8(dwarf::DW_OP_piece); - EmitULEB128(8); - - OutStreamer.AddComment("DW_OP_regx for Q register: D2"); - EmitInt8(dwarf::DW_OP_regx); - EmitULEB128(D2); - OutStreamer.AddComment("DW_OP_piece 8"); - EmitInt8(dwarf::DW_OP_piece); - EmitULEB128(8); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) { + AsmPrinter::EmitDwarfRegOp(MLoc, Indirect); + return; + } + assert(MLoc.isReg() && !Indirect && + "This doesn't support offset/indirection - implement it if needed"); + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + bool odd = SReg & 0x1; + unsigned Rx = 256 + (SReg >> 1); + + OutStreamer.AddComment("DW_OP_regx for S register"); + EmitInt8(dwarf::DW_OP_regx); + + OutStreamer.AddComment(Twine(SReg)); + EmitULEB128(Rx); + + if (odd) { + OutStreamer.AddComment("DW_OP_bit_piece 32 32"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(32); + } else { + OutStreamer.AddComment("DW_OP_bit_piece 32 0"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(0); } + } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. + // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) + // DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("DW_OP_regx for Q register: D1"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D1); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + + OutStreamer.AddComment("DW_OP_regx for Q register: D2"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D2); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); } } @@ -312,7 +145,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) { const GlobalValue *GV = dyn_cast(CV->stripPointerCasts()); assert(GV && "C++ constructor pointer was not a GlobalValue!"); - const MCExpr *E = MCSymbolRefExpr::Create(Mang->getSymbol(GV), + const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV), (Subtarget->isTargetDarwin() ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_ARM_TARGET1), @@ -373,7 +206,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, else if ((Modifier && strcmp(Modifier, "hi16") == 0) || (TF & ARMII::MO_HI16)) O << ":upper16:"; - O << *Mang->getSymbol(GV); + O << *getSymbol(GV); printOffset(MO.getOffset(), O); if (TF == ARMII::MO_PLT) @@ -474,8 +307,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, // This takes advantage of the 2 operand-ness of ldm/stm and that we've // already got the operands in registers that are operands to the // inline asm statement. 
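Both the old and the reindented EmitDwarfRegOp describe S registers as 32-bit pieces of a D register and Q registers as two D-register pieces. A standalone sketch of that mapping, using plain integers rather than LLVM register numbers:

#include <cstdio>

struct SRegLoc { unsigned Regx; unsigned BitSize; unsigned BitOffset; };

static SRegLoc describeSReg(unsigned SReg) {        // SReg = 0..31
  // S[2x]   -> DW_OP_regx(256 + x) DW_OP_bit_piece(32, 0)
  // S[2x+1] -> DW_OP_regx(256 + x) DW_OP_bit_piece(32, 32)
  return {256u + (SReg >> 1), 32u, (SReg & 1) ? 32u : 0u};
}

int main() {
  for (unsigned S : {0u, 1u, 5u, 31u}) {
    SRegLoc L = describeSReg(S);
    std::printf("S%-2u -> regx %u, bit_piece(%u, %u)\n",
                S, L.Regx, L.BitSize, L.BitOffset);
  }
  // Q registers: Qx -> regx(256+2x) piece(8) then regx(256+2x+1) piece(8).
  unsigned Q = 3;
  std::printf("Q%u  -> regx %u piece(8), regx %u piece(8)\n",
              Q, 256 + 2 * Q, 256 + 2 * Q + 1);
}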
- - O << "{" << ARMInstPrinter::getRegisterName(RegBegin); + O << "{"; + if (ARM::GPRPairRegClass.contains(RegBegin)) { + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); + O << ARMInstPrinter::getRegisterName(Reg0) << ", ";; + RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1); + } + O << ARMInstPrinter::getRegisterName(RegBegin); // FIXME: The register allocator not only may not have given us the // registers in sequence, but may not be in ascending registers. This @@ -500,7 +339,38 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!FlagsOP.isImm()) return true; unsigned Flags = FlagsOP.getImm(); + + // This operand may not be the one that actually provides the register. If + // it's tied to a previous one then we should refer instead to that one + // for registers and their classes. + unsigned TiedIdx; + if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) { + for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) { + unsigned OpFlags = MI->getOperand(OpNum).getImm(); + OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1; + } + Flags = MI->getOperand(OpNum).getImm(); + + // Later code expects OpNum to be pointing at the register rather than + // the flags. + OpNum += 1; + } + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + unsigned RC; + InlineAsm::hasRegClassConstraint(Flags, RC); + if (RC == ARM::GPRPairRegClassID) { + if (NumVals != 1) + return true; + const MachineOperand &MO = MI->getOperand(OpNum); + if (!MO.isReg()) + return true; + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ? + ARM::gsub_0 : ARM::gsub_1); + O << ARMInstPrinter::getRegisterName(Reg); + return false; + } if (NumVals != 2) return true; unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1; @@ -704,11 +574,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } - // FIXME: This should eventually end up somewhere else where more - // intelligent flag decisions can be made. For now we are just maintaining - // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default. - if (MCELFStreamer *MES = dyn_cast(&OutStreamer)) - MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5); } //===----------------------------------------------------------------------===// @@ -718,145 +583,150 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // to appear in the .ARM.attributes section in ELF. // Instead of subclassing the MCELFStreamer, we do the work here. 
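The tied-operand fix above walks INLINEASM operand groups, each consisting of one flag operand followed by its registers, to find the operand that actually supplies the register. A simplified sketch of that walk over an invented group list (this is not the real InlineAsm encoding, just the indexing idea):

#include <cstdio>
#include <vector>

struct OperandGroup { unsigned NumRegs; };

static unsigned groupStart(const std::vector<OperandGroup> &Groups,
                           unsigned FirstOperand, unsigned TiedIdx) {
  unsigned OpNum = FirstOperand;
  for (unsigned G = 0; G != TiedIdx; ++G)
    OpNum += Groups[G].NumRegs + 1;    // skip the flag plus its registers
  return OpNum;                        // index of group TiedIdx's flag operand
}

int main() {
  std::vector<OperandGroup> Groups = {{1}, {2}, {1}};
  // With the first group at operand 2, group 2 starts at 2 + 2 + 3 = 7.
  std::printf("group 2 starts at operand %u\n", groupStart(Groups, 2, 2));
}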
-void ARMAsmPrinter::emitAttributes() { - - emitARMAttributeSection(); - - /* GAS expect .fpu to be emitted, regardless of VFP build attribute */ - bool emitFPU = false; - AttributeEmitter *AttrEmitter; - if (OutStreamer.hasRawTextSupport()) { - AttrEmitter = new AsmAttributeEmitter(OutStreamer); - emitFPU = true; - } else { - MCObjectStreamer &O = static_cast(OutStreamer); - AttrEmitter = new ObjectAttributeEmitter(O); - } - - AttrEmitter->MaybeSwitchVendor("aeabi"); - - std::string CPUString = Subtarget->getCPUString(); - - if (CPUString == "cortex-a8" || - Subtarget->isCortexA8()) { - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8"); - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7); - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile, - ARMBuildAttrs::ApplicationProfile); - AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, - ARMBuildAttrs::Allowed); - AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, - ARMBuildAttrs::AllowThumb32); - // Fixme: figure out when this is emitted. - //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch, - // ARMBuildAttrs::AllowWMMXv1); - // - - /// ADD additional Else-cases here! - } else if (CPUString == "xscale") { - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ); - AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, - ARMBuildAttrs::Allowed); - AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, - ARMBuildAttrs::Allowed); - } else if (CPUString == "generic") { - // For a generic CPU, we assume a standard v7a architecture in Subtarget. - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7); - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile, - ARMBuildAttrs::ApplicationProfile); - AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, - ARMBuildAttrs::Allowed); - AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, - ARMBuildAttrs::AllowThumb32); - } else if (Subtarget->hasV7Ops()) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7); - AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, - ARMBuildAttrs::AllowThumb32); +static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, + const ARMSubtarget *Subtarget) { + if (CPU == "xscale") + return ARMBuildAttrs::v5TEJ; + + if (Subtarget->hasV8Ops()) + return ARMBuildAttrs::v8; + else if (Subtarget->hasV7Ops()) { + if (Subtarget->isMClass() && Subtarget->hasThumb2DSP()) + return ARMBuildAttrs::v7E_M; + return ARMBuildAttrs::v7; } else if (Subtarget->hasV6T2Ops()) - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2); + return ARMBuildAttrs::v6T2; + else if (Subtarget->hasV6MOps()) + return ARMBuildAttrs::v6S_M; else if (Subtarget->hasV6Ops()) - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6); + return ARMBuildAttrs::v6; else if (Subtarget->hasV5TEOps()) - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE); + return ARMBuildAttrs::v5TE; else if (Subtarget->hasV5TOps()) - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T); + return ARMBuildAttrs::v5T; else if (Subtarget->hasV4TOps()) - AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T); + return ARMBuildAttrs::v4T; + else + return ARMBuildAttrs::v4; +} - if (Subtarget->hasNEON() && emitFPU) { - /* NEON is not exactly a VFP architecture, but GAS emit one of - * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ - if (Subtarget->hasVFP4()) - 
AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, - "neon-vfpv4"); - else - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); - /* If emitted for NEON, omit from VFP below, since you can have both - * NEON and VFP in build attributes but only one .fpu */ - emitFPU = false; +void ARMAsmPrinter::emitAttributes() { + MCTargetStreamer &TS = OutStreamer.getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast(TS); + + ATS.switchVendor("aeabi"); + + std::string CPUString = Subtarget->getCPUString(); + + if (CPUString != "generic") + ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); + + ATS.emitAttribute(ARMBuildAttrs::CPU_arch, + getArchForCPU(CPUString, Subtarget)); + + if (Subtarget->isAClass()) { + ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::ApplicationProfile); + } else if (Subtarget->isRClass()) { + ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::RealTimeProfile); + } else if (Subtarget->isMClass()){ + ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::MicroControllerProfile); } - /* VFPv4 + .fpu */ - if (Subtarget->hasVFP4()) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, - ARMBuildAttrs::AllowFPv4A); - if (emitFPU) - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv4"); - - /* VFPv3 + .fpu */ - } else if (Subtarget->hasVFP3()) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, - ARMBuildAttrs::AllowFPv3A); - if (emitFPU) - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3"); - - /* VFPv2 + .fpu */ - } else if (Subtarget->hasVFP2()) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, - ARMBuildAttrs::AllowFPv2); - if (emitFPU) - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2"); + ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ? + ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed); + if (Subtarget->isThumb1Only()) { + ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::Allowed); + } else if (Subtarget->hasThumb2()) { + ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumb32); } - /* TODO: ARMBuildAttrs::Allowed is not completely accurate, - * since NEON can have 1 (allowed) or 2 (MAC operations) */ if (Subtarget->hasNEON()) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, - ARMBuildAttrs::Allowed); + /* NEON is not exactly a VFP architecture, but GAS emit one of + * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ + if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasCrypto()) + ATS.emitFPU(ARM::CRYPTO_NEON_FP_ARMV8); + else + ATS.emitFPU(ARM::NEON_FP_ARMV8); + } + else if (Subtarget->hasVFP4()) + ATS.emitFPU(ARM::NEON_VFPV4); + else + ATS.emitFPU(ARM::NEON); + // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture + if (Subtarget->hasV8Ops()) + ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::AllowNeonARMv8); + } else { + if (Subtarget->hasFPARMv8()) + ATS.emitFPU(ARM::FP_ARMV8); + else if (Subtarget->hasVFP4()) + ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4); + else if (Subtarget->hasVFP3()) + ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3); + else if (Subtarget->hasVFP2()) + ATS.emitFPU(ARM::VFPV2); } // Signal various FP modes. 
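The rewritten emitAttributes selects a single .fpu name from the subtarget's FP/NEON features. A hedged sketch of that selection ladder; the flag struct, helper and name strings here are stand-ins for the real ARMFPUName table:

#include <iostream>
#include <string>

struct FPFeatures {
  bool FPARMv8, Crypto, NEON, VFP4, VFP3, VFP2, D16;
};

static std::string chooseFPU(const FPFeatures &F) {
  if (F.NEON) {
    if (F.FPARMv8)
      return F.Crypto ? "crypto-neon-fp-armv8" : "neon-fp-armv8";
    return F.VFP4 ? "neon-vfpv4" : "neon";
  }
  if (F.FPARMv8) return "fp-armv8";
  if (F.VFP4)    return F.D16 ? "vfpv4-d16" : "vfpv4";
  if (F.VFP3)    return F.D16 ? "vfpv3-d16" : "vfpv3";
  if (F.VFP2)    return "vfpv2";
  return "none";
}

int main() {
  std::cout << chooseFPU({true, true, true, true, true, true, false}) << "\n";
  // -> crypto-neon-fp-armv8
  std::cout << chooseFPU({false, false, false, true, true, true, true}) << "\n";
  // -> vfpv4-d16
}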
if (!TM.Options.UnsafeFPMath) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal, - ARMBuildAttrs::Allowed); - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions, - ARMBuildAttrs::Allowed); + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed); + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, + ARMBuildAttrs::Allowed); } if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath) - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model, - ARMBuildAttrs::Allowed); + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::Allowed); else - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model, - ARMBuildAttrs::AllowIEE754); + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::AllowIEE754); // FIXME: add more flags to ARMBuildAttrs.h // 8-bytes alignment stuff. - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1); - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1); + ATS.emitAttribute(ARMBuildAttrs::ABI_align8_needed, 1); + ATS.emitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1); + + // ABI_HardFP_use attribute to indicate single precision FP. + if (Subtarget->isFPOnlySP()) + ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use, + ARMBuildAttrs::HardFPSinglePrecision); // Hard float. Use both S and D registers and conform to AAPCS-VFP. - if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) { - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3); - AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1); - } + if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) + ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS); + // FIXME: Should we signal R9 usage? - if (Subtarget->hasDivide()) - AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1); + if (Subtarget->hasFP16()) + ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP); + + if (Subtarget->hasMPExtension()) + ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP); + + if (Subtarget->hasDivide()) { + // Check if hardware divide is only available in thumb2 or ARM as well. + ATS.emitAttribute(ARMBuildAttrs::DIV_use, + Subtarget->hasDivideInARMMode() ? ARMBuildAttrs::AllowDIVExt : + ARMBuildAttrs::AllowDIVIfExists); + } - AttrEmitter->Finish(); - delete AttrEmitter; + if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization()) + ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowTZVirtualization); + else if (Subtarget->hasTrustZone()) + ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowTZ); + else if (Subtarget->hasVirtualization()) + ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowVirtualization); + + ATS.finishAttributeSection(); } void ARMAsmPrinter::emitARMAttributeSection() { @@ -908,7 +778,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) { bool isIndirect = Subtarget->isTargetDarwin() && Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); if (!isIndirect) - return Mang->getSymbol(GV); + return getSymbol(GV); // FIXME: Remove this when Darwin transition to @GOT like syntax. 
MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); @@ -919,7 +789,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) { MMIMachO.getGVStubEntry(MCSym); if (StubSym.getPointer() == 0) StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); return MCSym; } @@ -1092,27 +962,12 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); } -void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps==4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast(MI->getOperand(NOps-1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS); - OS << ']'; - OS << "+"; - printOperand(MI, NOps-2, OS); -} - void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { assert(MI->getFlag(MachineInstr::FrameSetup) && "Only instruction which are involved into frame setup code are allowed"); + MCTargetStreamer &TS = OutStreamer.getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast(TS); const MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); const ARMFunctionInfo &AFI = *MF.getInfo(); @@ -1175,7 +1030,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { RegList.push_back(SrcReg); break; } - OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); + ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); } else { // Changes of stack / frame pointer. if (SrcReg == ARM::SP) { @@ -1223,11 +1078,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { if (DstReg == FramePtr && FramePtr != ARM::SP) // Set-up of the frame pointer. Positive values correspond to "add" // instruction. - OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset); + ATS.emitSetFP(FramePtr, ARM::SP, -Offset); else if (DstReg == ARM::SP) { // Change of SP by an offset. Positive values correspond to "sub" // instruction. 
- OutStreamer.EmitPad(Offset); + ATS.emitPad(Offset); } else { MI->dump(); llvm_unreachable("Unsupported opcode for unwinding information"); @@ -1272,15 +1127,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { unsigned Opc = MI->getOpcode(); switch (Opc) { case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass"); - case ARM::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } + case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing"); case ARM::LEApcrel: case ARM::tLEApcrel: case ARM::t2LEApcrel: { @@ -1376,7 +1223,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { .addReg(0)); const GlobalValue *GV = MI->getOperand(0).getGlobal(); - MCSymbol *GVSym = Mang->getSymbol(GV); + MCSymbol *GVSym = getSymbol(GV); const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext); OutStreamer.EmitInstruction(MCInstBuilder(ARM::Bcc) .addExpr(GVSymExpr) diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index c945e4f28699..de72e063e0d5 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -97,13 +97,9 @@ private: const MachineInstr *MI); public: - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - virtual MachineLocation - getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE; - /// EmitDwarfRegOp - Emit dwarf register operation. - virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const LLVM_OVERRIDE; + virtual void EmitDwarfRegOp(const MachineLocation &MLoc, bool Indirect) const + LLVM_OVERRIDE; virtual unsigned getISAEncoding() LLVM_OVERRIDE { // ARM/Darwin adds ISA to the DWARF info for each function. diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 60050542716c..f835a4e5b5fe 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "ARMBaseInstrInfo.h" #include "ARM.h" +#include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMConstantPoolValue.h" +#include "ARMFeatures.h" #include "ARMHazardRecognizer.h" #include "ARMMachineFunctionInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" @@ -36,7 +37,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#define GET_INSTRINFO_CTOR +#define GET_INSTRINFO_CTOR_DTOR #include "ARMGenInstrInfo.inc" using namespace llvm; @@ -113,8 +114,7 @@ ScheduleHazardRecognizer *ARMBaseInstrInfo:: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const { if (Subtarget.isThumb2() || Subtarget.hasVFP2()) - return (ScheduleHazardRecognizer *) - new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG); + return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG); return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); } @@ -273,104 +273,90 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. + TBB = 0; + FBB = 0; + MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) - return false; + return false; // Empty blocks are easy. 
--I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - unsigned LastOpc = LastInst->getOpcode(); - // Check if it's an indirect branch first, this should return 'unanalyzable' - // even if it's predicated. - if (isIndirectBranchOpcode(LastOpc)) - return true; + // Walk backwards from the end of the basic block until the branch is + // analyzed or we give up. + while (isPredicated(I) || I->isTerminator()) { - if (!isUnpredicatedTerminator(I)) - return false; + // Flag to be raised on unanalyzeable instructions. This is useful in cases + // where we want to clean up on the end of the basic block before we bail + // out. + bool CantAnalyze = false; - // If there is only one terminator instruction, process it. - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (isUncondBranchOpcode(LastOpc)) { - TBB = LastInst->getOperand(0).getMBB(); - return false; + // Skip over DEBUG values and predicated nonterminators. + while (I->isDebugValue() || !I->isTerminator()) { + if (I == MBB.begin()) + return false; + --I; } - if (isCondBranchOpcode(LastOpc)) { - // Block ends with fall-through condbranch. - TBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(LastInst->getOperand(1)); - Cond.push_back(LastInst->getOperand(2)); - return false; + + if (isIndirectBranchOpcode(I->getOpcode()) || + isJumpTableBranchOpcode(I->getOpcode())) { + // Indirect branches and jump tables can't be analyzed, but we still want + // to clean up any instructions at the tail of the basic block. + CantAnalyze = true; + } else if (isUncondBranchOpcode(I->getOpcode())) { + TBB = I->getOperand(0).getMBB(); + } else if (isCondBranchOpcode(I->getOpcode())) { + // Bail out if we encounter multiple conditional branches. + if (!Cond.empty()) + return true; + + assert(!FBB && "FBB should have been null."); + FBB = TBB; + TBB = I->getOperand(0).getMBB(); + Cond.push_back(I->getOperand(1)); + Cond.push_back(I->getOperand(2)); + } else if (I->isReturn()) { + // Returns can't be analyzed, but we should run cleanup. + CantAnalyze = !isPredicated(I); + } else { + // We encountered other unrecognized terminator. Bail out immediately. + return true; } - return true; // Can't handle indirect branch. - } - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If AllowModify is true and the block ends with two or more unconditional - // branches, delete all but the first unconditional branch. - if (AllowModify && isUncondBranchOpcode(LastOpc)) { - while (isUncondBranchOpcode(SecondLastOpc)) { - LastInst->eraseFromParent(); - LastInst = SecondLastInst; - LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - // Return now the only terminator is an unconditional branch. - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else { - SecondLastInst = I; - SecondLastOpc = SecondLastInst->getOpcode(); + // Cleanup code - to be run for unpredicated unconditional branches and + // returns. + if (!isPredicated(I) && + (isUncondBranchOpcode(I->getOpcode()) || + isIndirectBranchOpcode(I->getOpcode()) || + isJumpTableBranchOpcode(I->getOpcode()) || + I->isReturn())) { + // Forget any previous condition branch information - it no longer applies. + Cond.clear(); + FBB = 0; + + // If we can modify the function, delete everything below this + // unconditional branch. 
+ if (AllowModify) { + MachineBasicBlock::iterator DI = llvm::next(I); + while (DI != MBB.end()) { + MachineInstr *InstToDelete = DI; + ++DI; + InstToDelete->eraseFromParent(); + } } } - } - // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) - return true; - - // If the block ends with a B and a Bcc, handle it. - if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - Cond.push_back(SecondLastInst->getOperand(1)); - Cond.push_back(SecondLastInst->getOperand(2)); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } + if (CantAnalyze) + return true; - // If the block ends with two unconditional branches, handle it. The second - // one is not executed, so remove it. - if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } + if (I == MBB.begin()) + return false; - // ...likewise if it ends with a branch table followed by an unconditional - // branch. The branch folder can create these, and we must get rid of them for - // correctness of Thumb constant islands. - if ((isJumpTableBranchOpcode(SecondLastOpc) || - isIndirectBranchOpcode(SecondLastOpc)) && - isUncondBranchOpcode(LastOpc)) { - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return true; + --I; } - // Otherwise, can't handle this. - return true; + // We made it past the terminators without bailing out - we must have + // analyzed this branch successfully. + return false; } @@ -535,11 +521,17 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { if (!MI->isPredicable()) return false; - if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) { - ARMFunctionInfo *AFI = - MI->getParent()->getParent()->getInfo(); - return AFI->isThumb2Function(); + ARMFunctionInfo *AFI = + MI->getParent()->getParent()->getInfo(); + + if (AFI->isThumb2Function()) { + if (getSubtarget().restrictIT()) + return isV8EligibleForIT(MI); + } else { // non-Thumb + if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) + return false; } + return true; } @@ -660,16 +652,16 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { bool GPRDest = ARM::GPRRegClass.contains(DestReg); - bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); + bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); if (GPRDest && GPRSrc) { AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)))); + .addReg(SrcReg, getKillRegState(KillSrc)))); return; } bool SPRDest = ARM::SPRRegClass.contains(DestReg); - bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); + bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); unsigned Opc = 0; if (SPRDest && SPRSrc) @@ -698,26 +690,47 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, int Spacing = 1; // Use VORRq when possible. 
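The rewritten AnalyzeBranch walks the block backwards, skips predicated instructions, records at most one conditional and one unconditional target, and bails out on anything it cannot model. Below is a much-simplified, self-contained model of that walk; it omits the AllowModify cleanup and the CantAnalyze bookkeeping, and all the types are invented for the sketch.

#include <iostream>
#include <string>
#include <vector>

enum class Kind { Other, UncondBr, CondBr, IndirectBr, Return };

struct Inst {
  Kind K;
  std::string Target;     // branch destination, if any
  bool Predicated;
};

// Returns false on success (TBB/FBB filled in), true if unanalyzable.
static bool analyzeBranch(const std::vector<Inst> &Block,
                          std::string &TBB, std::string &FBB) {
  TBB.clear();
  FBB.clear();
  bool SawCond = false;
  for (auto I = Block.rbegin(); I != Block.rend(); ++I) {
    if (I->K == Kind::Other && !I->Predicated)
      return false;                   // walked past the terminators: done
    switch (I->K) {
    case Kind::UncondBr:
      TBB = I->Target;                // anything after this is dead
      FBB.clear();
      break;
    case Kind::CondBr:
      if (SawCond)
        return true;                  // two conditional branches: give up
      SawCond = true;
      FBB = TBB;                      // previous target becomes the false side
      TBB = I->Target;
      break;
    case Kind::IndirectBr:
    case Kind::Return:
      return true;                    // cannot be modelled
    case Kind::Other:
      break;                          // predicated non-terminator: skip it
    }
  }
  return false;
}

int main() {
  std::string TBB, FBB;
  std::vector<Inst> BB = {{Kind::Other, "", false},
                          {Kind::CondBr, "then", false},
                          {Kind::UncondBr, "else", false}};
  if (!analyzeBranch(BB, TBB, FBB))
    std::cout << "TBB=" << TBB << " FBB=" << FBB << "\n";  // then / else
}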
- if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 2; - else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 4; + if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VORRq; + BeginIdx = ARM::qsub_0; + SubRegs = 2; + } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VORRq; + BeginIdx = ARM::qsub_0; + SubRegs = 4; // Fall back to VMOVD. - else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2; - else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3; - else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4; - else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) - Opc = ARM::MOVr, BeginIdx = ARM::gsub_0, SubRegs = 2; - - else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2; - else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3, Spacing = 2; - else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2; + } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 2; + } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 3; + } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 4; + } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) { + Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr; + BeginIdx = ARM::gsub_0; + SubRegs = 2; + } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 2; + Spacing = 2; + } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 3; + Spacing = 2; + } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 4; + Spacing = 2; + } assert(Opc && "Impossible reg-to-reg copy"); @@ -726,26 +739,28 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy register tuples backward when the first Dest reg overlaps with SrcReg. if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) { - BeginIdx = BeginIdx + ((SubRegs-1)*Spacing); + BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing); Spacing = -Spacing; } #ifndef NDEBUG SmallSet DstRegs; #endif for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing); + unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); + unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); assert(Dst && Src && "Bad sub-register"); #ifndef NDEBUG assert(!DstRegs.count(Src) && "destructive vector copy"); DstRegs.insert(Dst); #endif - Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst) - .addReg(Src); + Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src); // VORR takes two source operands. if (Opc == ARM::VORRq) Mov.addReg(Src); Mov = AddDefaultPred(Mov); + // MOVr can set CC. + if (Opc == ARM::MOVr) + Mov = AddDefaultCC(Mov); } // Add implicit super-register defs and kills to the last instruction. 
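The register-tuple copy above visits sub-registers at BeginIdx + i*Spacing and reverses the walk when the first destination sub-register would overlap the source. A tiny sketch of that index arithmetic with plain integers:

#include <cstdio>

int main() {
  int BeginIdx = 0, SubRegs = 3, Spacing = 2;   // e.g. a spaced D-triple copy
  bool Overlaps = true;                         // pretend dest overlaps src
  if (Overlaps) {
    BeginIdx = BeginIdx + (SubRegs - 1) * Spacing;
    Spacing = -Spacing;                         // copy backwards instead
  }
  for (int i = 0; i != SubRegs; ++i)
    std::printf("copy sub-register index %d\n", BeginIdx + i * Spacing);
  // Prints 4, 2, 0: copying backwards avoids clobbering a source early.
}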
Mov->addRegisterDefined(DestReg, TRI); @@ -1214,16 +1229,6 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{ return true; } -MachineInstr* -ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::DBG_VALUE)) - .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr); - return &*MIB; -} - /// Create a copy of a const pool value. Update CPI to the new index and return /// the label UID. static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { @@ -1426,9 +1431,11 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case ARM::VLDRD: case ARM::VLDRS: case ARM::t2LDRi8: + case ARM::t2LDRBi8: case ARM::t2LDRDi8: case ARM::t2LDRSHi8: case ARM::t2LDRi12: + case ARM::t2LDRBi12: case ARM::t2LDRSHi12: break; } @@ -1445,8 +1452,10 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case ARM::VLDRD: case ARM::VLDRS: case ARM::t2LDRi8: + case ARM::t2LDRBi8: case ARM::t2LDRSHi8: case ARM::t2LDRi12: + case ARM::t2LDRBi12: case ARM::t2LDRSHi12: break; } @@ -1493,7 +1502,16 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, if ((Offset2 - Offset1) / 8 > 64) return false; - if (Load1->getMachineOpcode() != Load2->getMachineOpcode()) + // Check if the machine opcodes are different. If they are different + // then we consider them to not be of the same base address, + // EXCEPT in the case of Thumb2 byte loads where one is LDRBi8 and the other LDRBi12. + // In this case, they are considered to be the same because they are different + // encoding forms of the same basic instruction. + if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) && + !((Load1->getMachineOpcode() == ARM::t2LDRBi8 && + Load2->getMachineOpcode() == ARM::t2LDRBi12) || + (Load1->getMachineOpcode() == ARM::t2LDRBi12 && + Load2->getMachineOpcode() == ARM::t2LDRBi8))) return false; // FIXME: overly conservative? // Four loads in a row should be sufficient. @@ -1708,7 +1726,7 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, bool PreferFalse) const { assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) && "Unknown select instruction"); - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this); bool Invert = !DefMI; if (!DefMI) @@ -1716,11 +1734,17 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, if (!DefMI) return 0; + // Find new register class to use. + MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1); + unsigned DestReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); + if (!MRI.constrainRegClass(DestReg, PreviousClass)) + return 0; + // Create a new predicated version of DefMI. // Rfalse is the first use. MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - DefMI->getDesc(), - MI->getOperand(0).getReg()); + DefMI->getDesc(), DestReg); // Copy all the DefMI operands, excluding its (null) predicate. const MCInstrDesc &DefDesc = DefMI->getDesc(); @@ -1743,7 +1767,6 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, // register operand tied to the first def. 
// The tie makes the register allocator ensure the FalseReg is allocated the // same register as operand 0. - MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1); FalseReg.setImplicit(); NewMI.addOperand(FalseReg); NewMI->tieOperands(0, NewMI->getNumOperands() - 1); @@ -1803,6 +1826,14 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, const ARMBaseInstrInfo &TII, unsigned MIFlags) { + if (NumBytes == 0 && DestReg != BaseReg) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg) + .addReg(BaseReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); + return; + } + bool isSub = NumBytes < 0; if (isSub) NumBytes = -NumBytes; @@ -1826,6 +1857,115 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, } } +bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF, + MachineInstr *MI, + unsigned NumBytes) { + // This optimisation potentially adds lots of load and store + // micro-operations, it's only really a great benefit to code-size. + if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize)) + return false; + + // If only one register is pushed/popped, LLVM can use an LDR/STR + // instead. We can't modify those so make sure we're dealing with an + // instruction we understand. + bool IsPop = isPopOpcode(MI->getOpcode()); + bool IsPush = isPushOpcode(MI->getOpcode()); + if (!IsPush && !IsPop) + return false; + + bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD || + MI->getOpcode() == ARM::VLDMDIA_UPD; + bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH || + MI->getOpcode() == ARM::tPOP || + MI->getOpcode() == ARM::tPOP_RET; + + assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP && + MI->getOperand(1).getReg() == ARM::SP)) && + "trying to fold sp update into non-sp-updating push/pop"); + + // The VFP push & pop act on D-registers, so we can only fold an adjustment + // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try + // if this is violated. + if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0) + return false; + + // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+ + // pred) so the list starts at 4. Thumb1 starts after the predicate. + int RegListIdx = IsT1PushPop ? 2 : 4; + + // Calculate the space we'll need in terms of registers. + unsigned FirstReg = MI->getOperand(RegListIdx).getReg(); + unsigned RD0Reg, RegsNeeded; + if (IsVFPPushPop) { + RD0Reg = ARM::D0; + RegsNeeded = NumBytes / 8; + } else { + RD0Reg = ARM::R0; + RegsNeeded = NumBytes / 4; + } + + // We're going to have to strip all list operands off before + // re-adding them since the order matters, so save the existing ones + // for later. + SmallVector RegList; + for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) + RegList.push_back(MI->getOperand(i)); + + MachineBasicBlock *MBB = MI->getParent(); + const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo(); + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + + // Now try to find enough space in the reglist to allocate NumBytes. + for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded; + --CurReg) { + if (!IsPop) { + // Pushing any register is completely harmless, mark the + // register involved as undef since we don't care about it in + // the slightest. 
+ RegList.push_back(MachineOperand::CreateReg(CurReg, false, false, + false, false, true)); + --RegsNeeded; + continue; + } + + // However, we can only pop an extra register if it's not live. For + // registers live within the function we might clobber a return value + // register; the other way a register can be live here is if it's + // callee-saved. + if (isCalleeSavedRegister(CurReg, CSRegs) || + MBB->computeRegisterLiveness(TRI, CurReg, MI) != + MachineBasicBlock::LQR_Dead) { + // VFP pops don't allow holes in the register list, so any skip is fatal + // for our transformation. GPR pops do, so we should just keep looking. + if (IsVFPPushPop) + return false; + else + continue; + } + + // Mark the unimportant registers as in the POP. + RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false, + true)); + --RegsNeeded; + } + + if (RegsNeeded > 0) + return false; + + // Finally we know we can profitably perform the optimisation so go + // ahead: strip all existing registers off and add them back again + // in the right order. + for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) + MI->RemoveOperand(i); + + // Add the complete list back in. + MachineInstrBuilder MIB(MF, &*MI); + for (int i = RegList.size() - 1; i >= 0; --i) + MIB.addOperand(RegList[i]); + + return true; +} + bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, const ARMBaseInstrInfo &TII) { @@ -2232,8 +2372,32 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, isSafe = true; break; } - // Condition code is after the operand before CPSR. - ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm(); + // Condition code is after the operand before CPSR except for VSELs. + ARMCC::CondCodes CC; + bool IsInstrVSel = true; + switch (Instr.getOpcode()) { + default: + IsInstrVSel = false; + CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm(); + break; + case ARM::VSELEQD: + case ARM::VSELEQS: + CC = ARMCC::EQ; + break; + case ARM::VSELGTD: + case ARM::VSELGTS: + CC = ARMCC::GT; + break; + case ARM::VSELGED: + case ARM::VSELGES: + CC = ARMCC::GE; + break; + case ARM::VSELVSS: + case ARM::VSELVSD: + CC = ARMCC::VS; + break; + } + if (Sub) { ARMCC::CondCodes NewCC = getSwappedCondition(CC); if (NewCC == ARMCC::AL) @@ -2244,11 +2408,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // If it is safe to remove CmpInstr, the condition code of these // operands will be modified. if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && - Sub->getOperand(2).getReg() == SrcReg) - OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)), - NewCC)); - } - else + Sub->getOperand(2).getReg() == SrcReg) { + // VSel doesn't support condition code update. + if (IsInstrVSel) + return false; + OperandsToUpdate.push_back( + std::make_pair(&((*I).getOperand(IO - 1)), NewCC)); + } + } else switch (CC) { default: // CPSR can be used multiple times, we should continue. 
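tryFoldSPUpdateIntoPushPop only folds an SP adjustment that is a whole number of registers: 8 bytes per D register for a VFP push/pop, 4 bytes per GPR otherwise. A standalone sketch of that check (the helper name is invented):

#include <cstdio>

static bool computeRegsNeeded(bool IsVFP, unsigned NumBytes,
                              unsigned &RegsNeeded) {
  unsigned Width = IsVFP ? 8 : 4;
  if (NumBytes % Width != 0)
    return false;                 // can't express this adjustment in registers
  RegsNeeded = NumBytes / Width;
  return true;
}

int main() {
  unsigned N;
  if (computeRegsNeeded(false, 16, N))
    std::printf("GPR push/pop needs %u extra registers for 16 bytes\n", N); // 4
  if (!computeRegsNeeded(true, 12, N))
    std::printf("12 bytes cannot be folded into a VFP push/pop\n");
}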
@@ -3604,6 +3771,24 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } +unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const { + if (MI->isCopyLike() || MI->isInsertSubreg() || + MI->isRegSequence() || MI->isImplicitDef()) + return 0; + + if (MI->isBundle()) + return 0; + + const MCInstrDesc &MCID = MI->getDesc(); + + if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) { + // When predicated, CPSR is an additional source operand for CPSR updating + // instructions, this apparently increases their latencies. + return 1; + } + return 0; +} + unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, unsigned *PredCost) const { @@ -3685,8 +3870,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, return true; // Hoist VFP / NEON instructions with 4 or higher latency. - int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx, - /*FindMin=*/false); + int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); if (Latency < 0) Latency = getInstrLatency(ItinData, DefMI); if (Latency <= 3) @@ -4137,7 +4321,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines // the full D-register by loading the same value to both lanes. The // instruction is micro-coded with 2 uops, so don't do this until we can - // properly schedule micro-coded instuctions. The dispatcher stalls cause + // properly schedule micro-coded instructions. The dispatcher stalls cause // too big regressions. // Insert the dependency-breaking FCONSTD before MI. @@ -4152,6 +4336,8 @@ bool ARMBaseInstrInfo::hasNOP() const { } bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const { + if (MI->getNumOperands() < 4) + return true; unsigned ShOpVal = MI->getOperand(3).getImm(); unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal); // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1. 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 2ef659c23bd6..93e59647d220 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -46,7 +46,7 @@ public: MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const; - virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0; + virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * @@ -125,12 +125,6 @@ public: virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; - virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, - uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const; - virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, @@ -270,6 +264,8 @@ private: const MCInstrDesc &UseMCID, unsigned UseIdx, unsigned UseAlign) const; + unsigned getPredicationCost(const MachineInstr *MI) const; + unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, unsigned *PredCost = 0) const; @@ -366,6 +362,17 @@ bool isIndirectBranchOpcode(int Opc) { return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; } +static inline bool isPopOpcode(int Opc) { + return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET || + Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD || + Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD; +} + +static inline bool isPushOpcode(int Opc) { + return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD || + Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; +} + /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. @@ -405,6 +412,13 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, const ARMBaseRegisterInfo& MRI, unsigned MIFlags = 0); +/// Tries to add registers to the reglist of a given base-updating +/// push/pop instruction to adjust the stack by an additional +/// NumBytes. This can save a few bytes per function in code-size, but +/// obviously generates more memory traffic. As such, it only takes +/// effect in functions being optimised for size. +bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI, + unsigned NumBytes); /// rewriteARMFrameIndex / rewriteT2FrameIndex - /// Rewrite MI to access 'Offset' bytes from the FP. Return false if the diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index b0d34a76b014..8717dc0cde90 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -43,46 +43,73 @@ using namespace llvm; -ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, - const ARMSubtarget &sti) - : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), TII(tii), STI(sti), +ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti) + : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11), BasePtr(ARM::R6) { } const uint16_t* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - bool ghcCall = false; - - if (MF) { - const Function *F = MF->getFunction(); - ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false); - } - - if (ghcCall) { - return CSR_GHC_SaveList; - } - else { - return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) - ? 
CSR_iOS_SaveList : CSR_AAPCS_SaveList; + const uint16_t *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_SaveList + : CSR_AAPCS_SaveList; + + if (!MF) return RegList; + + const Function *F = MF->getFunction(); + if (F->getCallingConv() == CallingConv::GHC) { + // GHC set of callee saved regs is empty as all those regs are + // used for passing STG regs around + return CSR_NoRegs_SaveList; + } else if (F->hasFnAttribute("interrupt")) { + if (STI.isMClass()) { + // M-class CPUs have hardware which saves the registers needed to allow a + // function conforming to the AAPCS to function as a handler. + return CSR_AAPCS_SaveList; + } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") { + // Fast interrupt mode gives the handler a private copy of R8-R14, so less + // need to be saved to restore user-mode state. + return CSR_FIQ_SaveList; + } else { + // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by + // exception handling. + return CSR_GenericInt_SaveList; + } } + + return RegList; } const uint32_t* -ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const { +ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (CC == CallingConv::GHC) + // This is academic becase all GHC calls are (supposed to be) tail calls + return CSR_NoRegs_RegMask; return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; } const uint32_t* -ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { - return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) - ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; +ARMBaseRegisterInfo::getNoPreservedMask() const { + return CSR_NoRegs_RegMask; } const uint32_t* -ARMBaseRegisterInfo::getNoPreservedMask() const { - return CSR_NoRegs_RegMask; +ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const { + // This should return a register mask that is the same as that returned by + // getCallPreservedMask but that additionally preserves the register used for + // the first i32 argument (which must also be the register used to return a + // single i32 return value) + // + // In case that the calling convention does not use the same register for + // both or otherwise does not want to enable this optimization, the function + // should return NULL + if (CC == CallingConv::GHC) + // This is academic becase all GHC calls are (supposed to be) tail calls + return NULL; + return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; } BitVector ARMBaseRegisterInfo:: @@ -94,6 +121,7 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(ARM::SP); Reserved.set(ARM::PC); Reserved.set(ARM::FPSCR); + Reserved.set(ARM::APSR_NZCV); if (TFI->hasFP(MF)) Reserved.set(FramePtr); if (hasBasePointer(MF)) @@ -309,7 +337,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or // 3. There are VLAs in the function and the base pointer is disabled. 
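getCalleeSavedRegs now picks a save list from the calling convention and the "interrupt" attribute. An illustrative stand-alone version of that decision; the enum and flags are invented for the sketch, whereas the real code returns the generated CSR_* tables:

#include <iostream>
#include <string>

enum class SaveList { AAPCS, iOS, NoRegs, FIQ, GenericInt };

struct FnInfo {
  bool IsGHC;
  bool IsInterrupt;
  std::string InterruptKind;   // e.g. "IRQ" or "FIQ"
};

static SaveList pickSaveList(const FnInfo &F, bool TargetIOS, bool AAPCS,
                             bool MClass) {
  if (F.IsGHC)
    return SaveList::NoRegs;           // GHC uses every register for STG regs
  if (F.IsInterrupt) {
    if (MClass)
      return SaveList::AAPCS;          // hardware already saves the rest
    if (F.InterruptKind == "FIQ")
      return SaveList::FIQ;            // banked R8-R14, less to save
    return SaveList::GenericInt;
  }
  return (TargetIOS && !AAPCS) ? SaveList::iOS : SaveList::AAPCS;
}

int main() {
  FnInfo Handler{false, true, "FIQ"};
  bool UsesFIQList = pickSaveList(Handler, /*TargetIOS=*/false, /*AAPCS=*/true,
                                  /*MClass=*/false) == SaveList::FIQ;
  std::cout << (UsesFIQList ? "FIQ list" : "other list") << "\n";
}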
- if (!MF.getTarget().Options.RealignStack) + if (MF.getFunction()->hasFnAttribute("no-realign-stack")) return false; if (AFI->isThumb1OnlyFunction()) return false; @@ -357,14 +385,6 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return ARM::SP; } -unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const { - llvm_unreachable("What is the exception register"); -} - -unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const { - llvm_unreachable("What is the exception handler register"); -} - /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. void ARMBaseRegisterInfo:: @@ -375,6 +395,7 @@ emitLoadConstPool(MachineBasicBlock &MBB, ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val); @@ -556,9 +577,10 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB, if (Ins != MBB->end()) DL = Ins->getDebugLoc(); - const MCInstrDesc &MCID = TII.get(ADDriOpc); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const MCInstrDesc &MCID = TII.get(ADDriOpc); MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg) @@ -574,6 +596,8 @@ ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I, MachineInstr &MI = *I; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const ARMBaseInstrInfo &TII = + *static_cast(MF.getTarget().getInstrInfo()); ARMFunctionInfo *AFI = MF.getInfo(); int Off = Offset; // ARM doesn't need the general 64-bit offsets unsigned i = 0; @@ -671,6 +695,8 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const ARMBaseInstrInfo &TII = + *static_cast(MF.getTarget().getInstrInfo()); const ARMFrameLowering *TFI = static_cast(MF.getTarget().getFrameLowering()); ARMFunctionInfo *AFI = MF.getInfo(); @@ -696,12 +722,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } #endif // NDEBUG - // Special handling of dbg_value instructions. - if (MI.isDebugValue()) { - MI.getOperand(FIOperandNum). 
ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); - return; - } + assert(!MI.isDebugValue() && "DBG_VALUEs should be handled in target-independent code"); // Modify MI as necessary to handle as much of 'Offset' as possible bool Done = false; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 0679919152c0..e28fff68f4e2 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -72,9 +72,16 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { } } +static inline bool isCalleeSavedRegister(unsigned Reg, + const MCPhysReg *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + class ARMBaseRegisterInfo : public ARMGenRegisterInfo { protected: - const ARMBaseInstrInfo &TII; const ARMSubtarget &STI; /// FramePtr - ARM physical register used as frame ptr. @@ -86,8 +93,7 @@ protected: unsigned BasePtr; // Can be only subclassed. - explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, - const ARMSubtarget &STI); + explicit ARMBaseRegisterInfo(const ARMSubtarget &STI); // Return the opcode that implements 'Op', or 0 if no opcode unsigned getOpcode(int Op) const; @@ -96,9 +102,18 @@ public: /// Code Generation virtual methods... const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; const uint32_t *getNoPreservedMask() const; + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the + /// case that 'returned' is on an i32 first argument if the calling convention + /// is one that can (partially) model this attribute with a preserved mask + /// (i.e. it is a calling convention that uses the same register for the first + /// i32 argument and an i32 return value) + /// + /// Should return NULL in the case that the calling convention does not have + /// this property + const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; + BitVector getReservedRegs(const MachineFunction &MF) const; const TargetRegisterClass* @@ -142,10 +157,6 @@ public: unsigned getFrameRegister(const MachineFunction &MF) const; unsigned getBaseRegister() const { return BasePtr; } - // Exception handling queries. - unsigned getEHExceptionRegister() const; - unsigned getEHHandlerRegister() const; - bool isLowRegister(unsigned Reg) const; diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h index 11bd6a4a8dbc..b16d4ef54b6d 100644 --- a/lib/Target/ARM/ARMBuildAttrs.h +++ b/lib/Target/ARM/ARMBuildAttrs.h @@ -15,11 +15,13 @@ #ifndef __TARGET_ARMBUILDATTRS_H__ #define __TARGET_ARMBUILDATTRS_H__ +namespace llvm { namespace ARMBuildAttrs { + enum SpecialAttr { // This is for the .cpu asm attr. It translates into one or more // AttrType (below) entries in the .ARM.attributes section in the ELF. - SEL_CPU + SEL_CPU }; enum AttrType { @@ -57,7 +59,7 @@ namespace ARMBuildAttrs { ABI_FP_optimization_goals = 31, compatibility = 32, CPU_unaligned_access = 34, - VFP_HP_extension = 36, + FP_HP_extension = 36, ABI_FP_16bit_format = 38, MPextension_use = 42, // was 70, 2.08 ABI DIV_use = 44, @@ -89,10 +91,11 @@ namespace ARMBuildAttrs { v7 = 10, // e.g. Cortex A8, Cortex M3 v6_M = 11, // e.g. 
Cortex M1 v6S_M = 12, // v6_M with the System extensions - v7E_M = 13 // v7_M with DSP extensions + v7E_M = 13, // v7_M with DSP extensions + v8 = 14 // v8, AArch32 }; - enum CPUArchProfile { // (=7), uleb128 + enum CPUArchProfile { // (=7), uleb128 Not_Applicable = 0, // pre v7, or cross-profile code ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8) RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4) @@ -101,31 +104,67 @@ namespace ARMBuildAttrs { }; // The following have a lot of common use cases - enum { - //ARMISAUse (=8), uleb128 and THUMBISAUse (=9), uleb128 + enum { Not_Allowed = 0, Allowed = 1, - // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10) + // Tag_ARM_ISA_use (=8), uleb128 + + // Tag_THUMB_ISA_use, (=9), uleb128 + AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions) + + // Tag_FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10) AllowFPv2 = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA) AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA) - AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 - AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) + AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 + AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31 + AllowFPARMv8A = 7, // Use of the ARM v8-A FP ISA was permitted + AllowFPARMv8B = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31 // Tag_WMMX_arch, (=11), uleb128 - AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions) - - // Tag_WMMX_arch, (=11), uleb128 - AllowWMMXv1 = 2, // The user permitted this entity to use WMMX v2 + AllowWMMXv1 = 1, // The user permitted this entity to use WMMX v1 + AllowWMMXv2 = 2, // The user permitted this entity to use WMMX v2 + + // Tag_Advanced_SIMD_arch, (=12), uleb128 + AllowNeon = 1, // SIMDv1 was permitted + AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations) + AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted - // Tag_ABI_FP_denormal, (=20), uleb128 + // Tag_ABI_FP_denormal, (=20), uleb128 PreserveFPSign = 2, // sign when flushed-to-zero is preserved // Tag_ABI_FP_number_model, (=23), uleb128 AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI]) - AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings + AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings + + // Tag_ABI_HardFP_use, (=27), uleb128 + HardFPImplied = 0, // FP use should be implied by Tag_FP_arch + HardFPSinglePrecision = 1, // Single-precision only + + // Tag_ABI_VFP_args, (=28), uleb128 + BaseAAPCS = 0, + HardFPAAPCS = 1, + + // Tag_FP_HP_extension, (=36), uleb128 + AllowHPFP = 1, // Allow use of Half Precision FP + + // Tag_MPextension_use, (=42), uleb128 + AllowMP = 1, // Allow use of MP extensions + + // Tag_DIV_use, (=44), uleb128 + AllowDIVIfExists = 0, // Allow hardware divide if available in arch, or no info exists. + DisallowDIV = 1, // Hardware divide explicitly disallowed + AllowDIVExt = 2, // Allow hardware divide as optional architecture extension above + // the base arch specified by Tag_CPU_arch and Tag_CPU_arch_profile. 
+ + // Tag_Virtualization_use, (=68), uleb128 + AllowTZ = 1, + AllowVirtualization = 2, + AllowTZVirtualization = 3 }; -} + +} // namespace ARMBuildAttrs +} // namespace llvm #endif // __TARGET_ARMBUILDATTRS_H__ diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 8ff666ed2844..9bea4b2d68e9 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -207,10 +207,24 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, - (sub CSR_AAPCS_ThisReturn, R9))>; + (sub CSR_AAPCS_ThisReturn, R9))>; + +// The "interrupt" attribute is used to generate code that is acceptable in +// exception-handlers of various kinds. It makes us use a different return +// instruction (handled elsewhere) and affects which registers we must return to +// our "caller" in the same state as we receive them. + +// For most interrupts, all registers except SP and LR are shared with +// user-space. We mark LR to be saved anyway, since this is what the ARM backend +// generally does rather than tracking its liveness as a normal register. +def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>; + +// The fast interrupt handlers have more private state and get their own copies +// of R8-R12, in addition to SP and LR. As before, mark LR for saving too. + +// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and +// current frame lowering expects to encounter it while processing callee-saved +// registers. +def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>; + -// GHC set of callee saved regs is empty as all those regs are -// used for passing STG regs around -// add is a workaround for not being able to compile empty list: -// def CSR_GHC : CalleeSavedRegs<()>; -def CSR_GHC : CalleeSavedRegs<(add)>; diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 5e8e1739a984..568ca858c4d2 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -167,6 +167,8 @@ namespace { const { return 0; } unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val) const { return 0; } + unsigned NEONThumb2V8PostEncoder(const MachineInstr &MI,unsigned Val) + const { return 0; } unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val) const { return 0; } unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op) @@ -1044,8 +1046,8 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI, return; } else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) { uint32_t v = ~MI.getOperand(2).getImm(); - int32_t lsb = CountTrailingZeros_32(v); - int32_t msb = (32 - CountLeadingZeros_32(v)) - 1; + int32_t lsb = countTrailingZeros(v); + int32_t msb = (32 - countLeadingZeros(v)) - 1; // Instr{20-16} = msb, Instr{11-7} = lsb Binary |= (msb & 0x1F) << 16; Binary |= (lsb & 0x1F) << 7; diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 4891609b336f..cff5ce27bca6 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -128,7 +128,7 @@ namespace { // If the block size isn't a multiple of the known bits, assume the // worst case padding. 
if (Size & ((1u << Bits) - 1)) - Bits = CountTrailingZeros_32(Size); + Bits = countTrailingZeros(Size); return Bits; } @@ -753,6 +753,7 @@ initializeFunctionInfo(const std::vector &CPEMIs) { Scale = 4; break; + case ARM::LDRBi12: case ARM::LDRi12: case ARM::LDRcp: case ARM::t2LDRpci: diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index 4e703ec3c1a8..7d41c69f08b8 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -163,21 +163,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const { int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { - unsigned AlignMask = Alignment - 1; - const std::vector Constants = CP->getConstants(); - for (unsigned i = 0, e = Constants.size(); i != e; ++i) { - if (Constants[i].isMachineConstantPoolEntry() && - (Constants[i].getAlignment() & AlignMask) == 0) { - ARMConstantPoolValue *CPV = - (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; - ARMConstantPoolConstant *APC = dyn_cast(CPV); - if (!APC) continue; - if (APC->CVal == CVal && equals(APC)) - return i; - } - } - - return -1; + return getExistingMachineCPValueImpl(CP, Alignment); } bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) { @@ -216,22 +202,7 @@ ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s, int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { - unsigned AlignMask = Alignment - 1; - const std::vector Constants = CP->getConstants(); - for (unsigned i = 0, e = Constants.size(); i != e; ++i) { - if (Constants[i].isMachineConstantPoolEntry() && - (Constants[i].getAlignment() & AlignMask) == 0) { - ARMConstantPoolValue *CPV = - (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; - ARMConstantPoolSymbol *APS = dyn_cast(CPV); - if (!APS) continue; - - if (APS->S == S && equals(APS)) - return i; - } - } - - return -1; + return getExistingMachineCPValueImpl(CP, Alignment); } bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) { @@ -271,22 +242,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C, int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { - unsigned AlignMask = Alignment - 1; - const std::vector Constants = CP->getConstants(); - for (unsigned i = 0, e = Constants.size(); i != e; ++i) { - if (Constants[i].isMachineConstantPoolEntry() && - (Constants[i].getAlignment() & AlignMask) == 0) { - ARMConstantPoolValue *CPV = - (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; - ARMConstantPoolMBB *APMBB = dyn_cast(CPV); - if (!APMBB) continue; - - if (APMBB->MBB == MBB && equals(APMBB)) - return i; - } - } - - return -1; + return getExistingMachineCPValueImpl(CP, Alignment); } bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) { diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h index 93812fe6bb37..7ae7bf46f19d 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.h +++ b/lib/Target/ARM/ARMConstantPoolValue.h @@ -15,6 +15,7 @@ #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include @@ -64,6 +65,26 @@ protected: ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind, unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, bool AddCurrentAddress); + + template + int 
getExistingMachineCPValueImpl(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = Alignment - 1; + const std::vector &Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].getAlignment() & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + if (Derived *APC = dyn_cast(CPV)) + if (cast(this)->equals(APC)) + return i; + } + } + + return -1; + } + public: virtual ~ARMConstantPoolValue(); @@ -156,6 +177,10 @@ public: static bool classof(const ARMConstantPoolValue *APV) { return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA(); } + + bool equals(const ARMConstantPoolConstant *A) const { + return CVal == A->CVal && ARMConstantPoolValue::equals(A); + } }; /// ARMConstantPoolSymbol - ARM-specific constantpool values for external @@ -187,6 +212,10 @@ public: static bool classof(const ARMConstantPoolValue *ACPV) { return ACPV->isExtSymbol(); } + + bool equals(const ARMConstantPoolSymbol *A) const { + return S == A->S && ARMConstantPoolValue::equals(A); + } }; /// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic @@ -219,6 +248,10 @@ public: static bool classof(const ARMConstantPoolValue *ACPV) { return ACPV->isMachineBasicBlock(); } + + bool equals(const ARMConstantPoolMBB *A) const { + return MBB == A->MBB && ARMConstantPoolValue::equals(A); + } }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index beb843ca9aa8..e6f7f86c5587 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -692,10 +692,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), MI.getOperand(1).getReg()) - .addReg(MI.getOperand(2).getReg(), - getKillRegState(MI.getOperand(2).isKill())) + .addOperand(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) // 'pred' - .addReg(MI.getOperand(4).getReg()); + .addOperand(MI.getOperand(4)); MI.eraseFromParent(); return true; @@ -705,10 +704,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, unsigned Opc = AFI->isThumbFunction() ? 
ARM::t2MOVr : ARM::MOVr; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), MI.getOperand(1).getReg()) - .addReg(MI.getOperand(2).getReg(), - getKillRegState(MI.getOperand(2).isKill())) + .addOperand(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) // 'pred' - .addReg(MI.getOperand(4).getReg()) + .addOperand(MI.getOperand(4)) .addReg(0); // 's' bit MI.eraseFromParent(); @@ -717,39 +715,36 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::MOVCCsi: { BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), (MI.getOperand(1).getReg())) - .addReg(MI.getOperand(2).getReg(), - getKillRegState(MI.getOperand(2).isKill())) + .addOperand(MI.getOperand(2)) .addImm(MI.getOperand(3).getImm()) .addImm(MI.getOperand(4).getImm()) // 'pred' - .addReg(MI.getOperand(5).getReg()) + .addOperand(MI.getOperand(5)) .addReg(0); // 's' bit MI.eraseFromParent(); return true; } - case ARM::MOVCCsr: { BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr), (MI.getOperand(1).getReg())) - .addReg(MI.getOperand(2).getReg(), - getKillRegState(MI.getOperand(2).isKill())) - .addReg(MI.getOperand(3).getReg(), - getKillRegState(MI.getOperand(3).isKill())) + .addOperand(MI.getOperand(2)) + .addOperand(MI.getOperand(3)) .addImm(MI.getOperand(4).getImm()) .addImm(MI.getOperand(5).getImm()) // 'pred' - .addReg(MI.getOperand(6).getReg()) + .addOperand(MI.getOperand(6)) .addReg(0); // 's' bit MI.eraseFromParent(); return true; } + case ARM::t2MOVCCi16: case ARM::MOVCCi16: { - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16), + unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc), MI.getOperand(1).getReg()) .addImm(MI.getOperand(2).getImm()) .addImm(MI.getOperand(3).getImm()) // 'pred' - .addReg(MI.getOperand(4).getReg()); - + .addOperand(MI.getOperand(4)); MI.eraseFromParent(); return true; } @@ -760,23 +755,47 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.getOperand(1).getReg()) .addImm(MI.getOperand(2).getImm()) .addImm(MI.getOperand(3).getImm()) // 'pred' - .addReg(MI.getOperand(4).getReg()) + .addOperand(MI.getOperand(4)) .addReg(0); // 's' bit MI.eraseFromParent(); return true; } + case ARM::t2MVNCCi: case ARM::MVNCCi: { - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), + unsigned Opc = AFI->isThumbFunction() ? 
ARM::t2MVNi : ARM::MVNi; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), MI.getOperand(1).getReg()) .addImm(MI.getOperand(2).getImm()) .addImm(MI.getOperand(3).getImm()) // 'pred' - .addReg(MI.getOperand(4).getReg()) + .addOperand(MI.getOperand(4)) .addReg(0); // 's' bit MI.eraseFromParent(); return true; } + case ARM::t2MOVCClsl: + case ARM::t2MOVCClsr: + case ARM::t2MOVCCasr: + case ARM::t2MOVCCror: { + unsigned NewOpc; + switch (Opcode) { + case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break; + case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break; + case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break; + case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break; + default: llvm_unreachable("unexpeced conditional move"); + } + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc), + MI.getOperand(1).getReg()) + .addOperand(MI.getOperand(2)) + .addImm(MI.getOperand(3).getImm()) + .addImm(MI.getOperand(4).getImm()) // 'pred' + .addOperand(MI.getOperand(5)) + .addReg(0); // 's' bit + MI.eraseFromParent(); + return true; + } case ARM::Int_eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); const ARMBaseInstrInfo *AII = @@ -823,7 +842,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::MOVsrl_flag: case ARM::MOVsra_flag: { - // These are just fancy MOVs insructions. + // These are just fancy MOVs instructions. AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), MI.getOperand(0).getReg()) .addOperand(MI.getOperand(1)) @@ -938,6 +957,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ExpandMOV32BitImm(MBB, MBBI); return true; + case ARM::SUBS_PC_LR: { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC) + .addReg(ARM::LR) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addReg(ARM::CPSR, RegState::Undef); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } case ARM::VLDMQIA: { unsigned NewOpc = ARM::VLDMDIA; MachineInstrBuilder MIB = diff --git a/lib/Target/ARM/ARMFPUName.def b/lib/Target/ARM/ARMFPUName.def new file mode 100644 index 000000000000..9a1bbe703d99 --- /dev/null +++ b/lib/Target/ARM/ARMFPUName.def @@ -0,0 +1,32 @@ +//===-- ARMFPUName.def - List of the ARM FPU names --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the list of the supported ARM FPU names. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! 
+ +#ifndef ARM_FPU_NAME +#error "You must define ARM_FPU_NAME(NAME, ID) before including ARMFPUName.h" +#endif + +ARM_FPU_NAME("vfp", VFP) +ARM_FPU_NAME("vfpv2", VFPV2) +ARM_FPU_NAME("vfpv3", VFPV3) +ARM_FPU_NAME("vfpv3-d16", VFPV3_D16) +ARM_FPU_NAME("vfpv4", VFPV4) +ARM_FPU_NAME("vfpv4-d16", VFPV4_D16) +ARM_FPU_NAME("fp-armv8", FP_ARMV8) +ARM_FPU_NAME("neon", NEON) +ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4) +ARM_FPU_NAME("neon-fp-armv8", NEON_FP_ARMV8) +ARM_FPU_NAME("crypto-neon-fp-armv8", CRYPTO_NEON_FP_ARMV8) + +#undef ARM_FPU_NAME diff --git a/lib/Target/ARM/ARMFPUName.h b/lib/Target/ARM/ARMFPUName.h new file mode 100644 index 000000000000..2a64cce4880d --- /dev/null +++ b/lib/Target/ARM/ARMFPUName.h @@ -0,0 +1,26 @@ +//===-- ARMFPUName.h - List of the ARM FPU names ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMFPUNAME_H +#define ARMFPUNAME_H + +namespace llvm { +namespace ARM { + +enum FPUKind { + INVALID_FPU = 0 + +#define ARM_FPU_NAME(NAME, ID) , ID +#include "ARMFPUName.def" +}; + +} // namespace ARM +} // namespace llvm + +#endif // ARMFPUNAME_H diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 5d45f6491240..a4004f32db37 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -20,6 +20,7 @@ #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -175,6 +176,8 @@ class ARMFastISel : public FastISel { // Utility routines. private: + unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned OpNum, + unsigned Op); bool isTypeLegal(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, @@ -251,10 +254,10 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { const MCInstrDesc &MCID = MI->getDesc(); - // If we're a thumb2 or not NEON function we were handled via isPredicable. + // If we're a thumb2 or not NEON function we'll be handled via isPredicable. if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON || AFI->isThumb2Function()) - return false; + return MI->isPredicable(); for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) if (MCID.OpInfo[i].isPredicate()) @@ -275,7 +278,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { // Do we use a predicate? or... // Are we NEON in ARM mode and have a predicate operand? If so, I know // we're not predicable but add it anyways. - if (TII.isPredicable(MI) || isARMNEONPred(MI)) + if (isARMNEONPred(MI)) AddDefaultPred(MIB); // Do we optionally set a predicate? 
Preds is size > 0 iff the predicate @@ -290,6 +293,23 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { return MIB; } +unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II, + unsigned Op, unsigned OpNum) { + if (TargetRegisterInfo::isVirtualRegister(Op)) { + const TargetRegisterClass *RegClass = + TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF); + if (!MRI.constrainRegClass(Op, RegClass)) { + // If it's not legal to COPY between the register classes, something + // has gone very wrong before we got here. + unsigned NewOp = createResultReg(RegClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), NewOp).addReg(Op)); + return NewOp; + } + } + return Op; +} + unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode, const TargetRegisterClass* RC) { unsigned ResultReg = createResultReg(RC); @@ -305,6 +325,9 @@ unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + // Make sure the input operand is sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); if (II.getNumDefs() >= 1) { AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill)); @@ -325,6 +348,11 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + // Make sure the input operands are sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); + Op1 = constrainOperandRegClass(II, Op1, 2); + if (II.getNumDefs() >= 1) { AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -348,6 +376,12 @@ unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + // Make sure the input operands are sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); + Op1 = constrainOperandRegClass(II, Op1, 2); + Op2 = constrainOperandRegClass(II, Op1, 3); + if (II.getNumDefs() >= 1) { AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -372,6 +406,9 @@ unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + // Make sure the input operand is sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); if (II.getNumDefs() >= 1) { AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -394,6 +431,9 @@ unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + // Make sure the input operand is sufficiently constrained to be legal + // for this instruction. 
+ Op0 = constrainOperandRegClass(II, Op0, 1); if (II.getNumDefs() >= 1) { AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -417,6 +457,10 @@ unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + // Make sure the input operands are sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); + Op1 = constrainOperandRegClass(II, Op1, 2); if (II.getNumDefs() >= 1) { AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -609,6 +653,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { .addConstantPoolIndex(Idx)); else // The extra immediate is for addrmode2. + DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp), DestReg) .addConstantPoolIndex(Idx) @@ -628,6 +673,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned DestReg = createResultReg(RC); + // FastISel TLS support on non-Darwin is broken, punt to SelectionDAG. + const GlobalVariable *GVar = dyn_cast(GV); + bool IsThreadLocal = GVar && GVar->isThreadLocal(); + if (!Subtarget->isTargetDarwin() && IsThreadLocal) return 0; + // Use movw+movt when possible, it avoids constant pool entries. // Darwin targets don't support movt with Reloc::Static, see // ARMTargetLowering::LowerGlobalAddressDarwin. Other targets only support @@ -679,6 +729,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { AddOptionalDefs(MIB); } else { // The extra immediate is for addrmode2. + DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp), DestReg) .addConstantPoolIndex(Idx) @@ -814,22 +865,19 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { switch (Opcode) { default: break; - case Instruction::BitCast: { + case Instruction::BitCast: // Look through bitcasts. return ARMComputeAddress(U->getOperand(0), Addr); - } - case Instruction::IntToPtr: { + case Instruction::IntToPtr: // Look past no-op inttoptrs. if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) return ARMComputeAddress(U->getOperand(0), Addr); break; - } - case Instruction::PtrToInt: { + case Instruction::PtrToInt: // Look past no-op ptrtoints. if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) return ARMComputeAddress(U->getOperand(0), Addr); break; - } case Instruction::GetElementPtr: { Address SavedAddr = Addr; int TmpOffset = Addr.Offset; @@ -852,13 +900,8 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { TmpOffset += CI->getSExtValue() * S; break; } - if (isa(Op) && - (!isa(Op) || - FuncInfo.MBBMap[cast(Op)->getParent()] - == FuncInfo.MBB) && - isa(cast(Op)->getOperand(1))) { - // An add (in the same block) with a constant operand. Fold the - // constant. + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. ConstantInt *CI = cast(cast(Op)->getOperand(1)); TmpOffset += CI->getSExtValue() * S; @@ -1025,7 +1068,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, useAM3 = true; } } - RC = &ARM::GPRRegClass; + RC = isThumb2 ? 
&ARM::rGPRRegClass : &ARM::GPRnopcRegClass; break; case MVT::i16: if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem()) @@ -1040,7 +1083,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, Opc = isZExt ? ARM::LDRH : ARM::LDRSH; useAM3 = true; } - RC = &ARM::GPRRegClass; + RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; break; case MVT::i32: if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem()) @@ -1054,7 +1097,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, } else { Opc = ARM::LDRi12; } - RC = &ARM::GPRRegClass; + RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; break; case MVT::f32: if (!Subtarget->hasVFP2()) return false; @@ -1063,7 +1106,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, needVMOV = true; VT = MVT::i32; Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12; - RC = &ARM::GPRRegClass; + RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; } else { Opc = ARM::VLDRS; RC = TLI.getRegClassFor(VT); @@ -1136,6 +1179,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, (const TargetRegisterClass*)&ARM::tGPRRegClass : (const TargetRegisterClass*)&ARM::GPRRegClass); unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri; + SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), Res) .addReg(SrcReg).addImm(1)); @@ -1207,6 +1251,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, ARMSimplifyAddress(Addr, VT, useAM3); // Create the base instruction, then add the operands. + SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(StrOpc)) .addReg(SrcReg); @@ -1330,6 +1375,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri; unsigned OpReg = getRegForValue(TI->getOperand(0)); + OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc)) .addReg(OpReg).addImm(1)); @@ -1367,6 +1413,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { // and it left a value for us in a virtual register. Ergo, we test // the one-bit value left in the virtual register. unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri; + CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc)) .addReg(CmpReg).addImm(1)); @@ -1491,13 +1538,15 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, } } + const MCInstrDesc &II = TII.get(CmpOpc); + SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0); if (!UseImm) { - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CmpOpc)) + SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(SrcReg1).addReg(SrcReg2)); } else { MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(SrcReg1); // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0. @@ -1696,6 +1745,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) { } unsigned CmpOpc = isThumb2 ? 
ARM::t2CMPri : ARM::CMPri; + CondReg = constrainOperandRegClass(TII.get(CmpOpc), CondReg, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) .addReg(CondReg).addImm(0)); @@ -1712,12 +1762,16 @@ bool ARMFastISel::SelectSelect(const Instruction *I) { MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi; } unsigned ResultReg = createResultReg(RC); - if (!UseImm) + if (!UseImm) { + Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1); + Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg) .addReg(Op2Reg).addReg(Op1Reg).addImm(ARMCC::NE).addReg(ARM::CPSR); - else + } else { + Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg) .addReg(Op1Reg).addImm(Imm).addImm(ARMCC::EQ).addReg(ARM::CPSR); + } UpdateValueMap(I, ResultReg); return true; } @@ -1802,7 +1856,9 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { unsigned SrcReg2 = getRegForValue(I->getOperand(1)); if (SrcReg2 == 0) return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass); + SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1); + SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) .addReg(SrcReg1).addReg(SrcReg2)); @@ -1930,7 +1986,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, !VA.isRegLoc() || !ArgLocs[++i].isRegLoc()) return false; } else { - switch (static_cast(ArgVT).getSimpleVT().SimpleTy) { + switch (ArgVT.SimpleTy) { default: return false; case MVT::i1: @@ -1985,7 +2041,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); - assert (Arg != 0 && "Failed to emit a sext"); + assert (Arg != 0 && "Failed to emit a zext"); ArgVT = DestVT; break; } @@ -2182,10 +2238,14 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { } unsigned ARMFastISel::getLibcallReg(const Twine &Name) { + // Manually compute the global's type to avoid building it when unnecessary. + Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0); + EVT LCREVT = TLI.getValueType(GVTy); + if (!LCREVT.isSimple()) return 0; + GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false, GlobalValue::ExternalLinkage, 0, Name); - EVT LCREVT = TLI.getValueType(GV->getType()); - if (!LCREVT.isSimple()) return 0; + assert(GV->getType() == GVTy && "We miscomputed the type for the global!"); return ARMMaterializeGV(GV, LCREVT.getSimpleVT()); } @@ -2403,15 +2463,22 @@ bool ARMFastISel::SelectCall(const Instruction *I, MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); + unsigned char OpFlags = 0; + + // Add MO_PLT for global address or external symbol in the PIC relocation + // model. + if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + // ARM calls don't take a predicate, but tBL / tBLX do. if(isThumb2) AddDefaultPred(MIB); if (UseReg) MIB.addReg(CalleeReg); else if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); + MIB.addGlobalAddress(GV, 0, OpFlags); else - MIB.addExternalSymbol(IntrMemName, 0); + MIB.addExternalSymbol(IntrMemName, OpFlags); // Add implicit physical register uses to the call. 
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) @@ -2602,47 +2669,136 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt) { if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) return 0; + if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1) + return 0; - unsigned Opc; - bool isBoolZext = false; - const TargetRegisterClass *RC; - switch (SrcVT.SimpleTy) { - default: return 0; - case MVT::i16: - if (!Subtarget->hasV6Ops()) return 0; - RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; - if (isZExt) - Opc = isThumb2 ? ARM::t2UXTH : ARM::UXTH; - else - Opc = isThumb2 ? ARM::t2SXTH : ARM::SXTH; - break; - case MVT::i8: - if (!Subtarget->hasV6Ops()) return 0; - RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass; - if (isZExt) - Opc = isThumb2 ? ARM::t2UXTB : ARM::UXTB; - else - Opc = isThumb2 ? ARM::t2SXTB : ARM::SXTB; - break; - case MVT::i1: - if (isZExt) { - RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; - Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri; - isBoolZext = true; - break; + // Table of which combinations can be emitted as a single instruction, + // and which will require two. + static const uint8_t isSingleInstrTbl[3][2][2][2] = { + // ARM Thumb + // !hasV6Ops hasV6Ops !hasV6Ops hasV6Ops + // ext: s z s z s z s z + /* 1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } }, + /* 8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }, + /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } } + }; + + // Target registers for: + // - For ARM can never be PC. + // - For 16-bit Thumb are restricted to lower 8 registers. + // - For 32-bit Thumb are restricted to non-SP and non-PC. + static const TargetRegisterClass *RCTbl[2][2] = { + // Instructions: Two Single + /* ARM */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass }, + /* Thumb */ { &ARM::tGPRRegClass, &ARM::rGPRRegClass } + }; + + // Table governing the instruction(s) to be emitted. + static const struct InstructionTable { + uint32_t Opc : 16; + uint32_t hasS : 1; // Some instructions have an S bit, always set it to 0. + uint32_t Shift : 7; // For shift operand addressing mode, used by MOVsi. + uint32_t Imm : 8; // All instructions have either a shift or a mask. + } IT[2][2][3][2] = { + { // Two instructions (first is left shift, second is in this table). + { // ARM Opc S Shift Imm + /* 1 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 31 }, + /* 1 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 31 } }, + /* 8 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 24 }, + /* 8 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 24 } }, + /* 16 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 16 }, + /* 16 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 16 } } + }, + { // Thumb Opc S Shift Imm + /* 1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 31 }, + /* 1 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 31 } }, + /* 8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 24 }, + /* 8 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 24 } }, + /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 16 }, + /* 16 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 16 } } + } + }, + { // Single instruction. 
+ { // ARM Opc S Shift Imm + /* 1 bit sext */ { { ARM::KILL , 0, ARM_AM::no_shift, 0 }, + /* 1 bit zext */ { ARM::ANDri , 1, ARM_AM::no_shift, 1 } }, + /* 8 bit sext */ { { ARM::SXTB , 0, ARM_AM::no_shift, 0 }, + /* 8 bit zext */ { ARM::ANDri , 1, ARM_AM::no_shift, 255 } }, + /* 16 bit sext */ { { ARM::SXTH , 0, ARM_AM::no_shift, 0 }, + /* 16 bit zext */ { ARM::UXTH , 0, ARM_AM::no_shift, 0 } } + }, + { // Thumb Opc S Shift Imm + /* 1 bit sext */ { { ARM::KILL , 0, ARM_AM::no_shift, 0 }, + /* 1 bit zext */ { ARM::t2ANDri, 1, ARM_AM::no_shift, 1 } }, + /* 8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift, 0 }, + /* 8 bit zext */ { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } }, + /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift, 0 }, + /* 16 bit zext */ { ARM::t2UXTH , 0, ARM_AM::no_shift, 0 } } + } } - return 0; + }; + + unsigned SrcBits = SrcVT.getSizeInBits(); + unsigned DestBits = DestVT.getSizeInBits(); + (void) DestBits; + assert((SrcBits < DestBits) && "can only extend to larger types"); + assert((DestBits == 32 || DestBits == 16 || DestBits == 8) && + "other sizes unimplemented"); + assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) && + "other sizes unimplemented"); + + bool hasV6Ops = Subtarget->hasV6Ops(); + unsigned Bitness = SrcBits / 8; // {1,8,16}=>{0,1,2} + assert((Bitness < 3) && "sanity-check table bounds"); + + bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt]; + const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr]; + const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt]; + unsigned Opc = ITP->Opc; + assert(ARM::KILL != Opc && "Invalid table entry"); + unsigned hasS = ITP->hasS; + ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift; + assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) && + "only MOVsi has shift operand addressing mode"); + unsigned Imm = ITP->Imm; + + // 16-bit Thumb instructions always set CPSR (unless they're in an IT block). + bool setsCPSR = &ARM::tGPRRegClass == RC; + unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi; + unsigned ResultReg; + // MOVsi encodes shift and immediate in shift operand addressing mode. + // The following condition has the same value when emitting two + // instruction sequences: both are shifts. + bool ImmIsSO = (Shift != ARM_AM::no_shift); + + // Either one or two instructions are emitted. + // They're always of the form: + // dst = in OP imm + // CPSR is set only by 16-bit Thumb instructions. + // Predicate, if any, is AL. + // S bit, if available, is always 0. + // When two are emitted the first's result will feed as the second's input, + // that value is then dead. + unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2; + for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) { + ResultReg = createResultReg(RC); + bool isLsl = (0 == Instr) && !isSingleInstr; + unsigned Opcode = isLsl ? LSLOpc : Opc; + ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift; + unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm; + bool isKill = 1 == Instr; + MachineInstrBuilder MIB = BuildMI( + *FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg); + if (setsCPSR) + MIB.addReg(ARM::CPSR, RegState::Define); + SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR); + AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc)); + if (hasS) + AddDefaultCC(MIB); + // Second instruction consumes the first's result. 
+ SrcReg = ResultReg; } - unsigned ResultReg = createResultReg(RC); - MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) - .addReg(SrcReg); - if (isBoolZext) - MIB.addImm(1); - else - MIB.addImm(0); - AddOptionalDefs(MIB); return ResultReg; } @@ -2707,7 +2863,7 @@ bool ARMFastISel::SelectShift(const Instruction *I, if (Reg2 == 0) return false; } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass); if(ResultReg == 0) return false; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, @@ -2797,6 +2953,25 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { return false; } +namespace { +// This table describes sign- and zero-extend instructions which can be +// folded into a preceding load. All of these extends have an immediate +// (sometimes a mask and sometimes a shift) that's applied after +// extension. +const struct FoldableLoadExtendsStruct { + uint16_t Opc[2]; // ARM, Thumb. + uint8_t ExpectedImm; + uint8_t isZExt : 1; + uint8_t ExpectedVT : 7; +} FoldableLoadExtends[] = { + { { ARM::SXTH, ARM::t2SXTH }, 0, 0, MVT::i16 }, + { { ARM::UXTH, ARM::t2UXTH }, 0, 1, MVT::i16 }, + { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8 }, + { { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 }, + { { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 } +}; +} + /// \brief The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if @@ -2812,26 +2987,23 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, // ldrb r1, [r0] ldrb r1, [r0] // uxtb r2, r1 => // mov r3, r2 mov r3, r1 - bool isZExt = true; - switch(MI->getOpcode()) { - default: return false; - case ARM::SXTH: - case ARM::t2SXTH: - isZExt = false; - case ARM::UXTH: - case ARM::t2UXTH: - if (VT != MVT::i16) - return false; - break; - case ARM::SXTB: - case ARM::t2SXTB: - isZExt = false; - case ARM::UXTB: - case ARM::t2UXTB: - if (VT != MVT::i8) - return false; - break; + if (MI->getNumOperands() < 3 || !MI->getOperand(2).isImm()) + return false; + const uint64_t Imm = MI->getOperand(2).getImm(); + + bool Found = false; + bool isZExt; + for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends); + i != e; ++i) { + if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() && + (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm && + MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) { + Found = true; + isZExt = FoldableLoadExtends[i].isZExt; + } } + if (!Found) return false; + // See if we can handle this address. Address Addr; if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; @@ -2854,12 +3026,14 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT)); // Load value. if (isThumb2) { + DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::t2LDRpci), DestReg1) .addConstantPoolIndex(Idx)); Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs; } else { // The extra immediate is for addrmode2. 
+ DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp), DestReg1) .addConstantPoolIndex(Idx).addImm(0)); @@ -2873,6 +3047,9 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, } unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT)); + DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0); + DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1); + GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg2) .addReg(DestReg1) @@ -2938,12 +3115,10 @@ bool ARMFastISel::FastLowerArguments() { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32); + const TargetRegisterClass *RC = &ARM::rGPRRegClass; Idx = 0; for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++Idx) { - if (I->use_empty()) - continue; unsigned SrcReg = GPRArgRegs[Idx]; unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. @@ -2961,13 +3136,23 @@ bool ARMFastISel::FastLowerArguments() { namespace llvm { FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) { - // Completely untested on non-iOS. const TargetMachine &TM = funcInfo.MF->getTarget(); - // Darwin and thumb1 only for now. const ARMSubtarget *Subtarget = &TM.getSubtarget(); - if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only()) + // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl. + bool UseFastISel = false; + UseFastISel |= Subtarget->isTargetIOS() && !Subtarget->isThumb1Only(); + UseFastISel |= Subtarget->isTargetLinux() && !Subtarget->isThumb(); + UseFastISel |= Subtarget->isTargetNaCl() && !Subtarget->isThumb(); + + if (UseFastISel) { + // iOS always has a FP for backtracking, force other targets + // to keep their FP when doing FastISel. The emitted code is + // currently superior, and in cases like test-suite's lencod + // FastISel isn't quite correct when FP is eliminated. + TM.Options.NoFramePointerElim = true; return new ARMFastISel(funcInfo, libInfo); + } return 0; } } diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h new file mode 100644 index 000000000000..dafc4b3a82bd --- /dev/null +++ b/lib/Target/ARM/ARMFeatures.h @@ -0,0 +1,93 @@ +//===-- ARMFeatures.h - Checks for ARM instruction features ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the code shared between ARM CodeGen and ARM MC +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_ARM_FEATURES_H +#define TARGET_ARM_FEATURES_H + +#include "ARM.h" + +namespace llvm { + +template // could be MachineInstr or MCInst +inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) { + switch (Instr->getOpcode()) { + default: + return false; + case ARM::tADC: + case ARM::tADDi3: + case ARM::tADDi8: + case ARM::tADDrSPi: + case ARM::tADDrr: + case ARM::tAND: + case ARM::tASRri: + case ARM::tASRrr: + case ARM::tBIC: + case ARM::tCMNz: + case ARM::tCMPi8: + case ARM::tCMPr: + case ARM::tEOR: + case ARM::tLDRBi: + case ARM::tLDRBr: + case ARM::tLDRHi: + case ARM::tLDRHr: + case ARM::tLDRSB: + case ARM::tLDRSH: + case ARM::tLDRi: + case ARM::tLDRr: + case ARM::tLDRspi: + case ARM::tLSLri: + case ARM::tLSLrr: + case ARM::tLSRri: + case ARM::tLSRrr: + case ARM::tMOVi8: + case ARM::tMUL: + case ARM::tMVN: + case ARM::tORR: + case ARM::tROR: + case ARM::tRSB: + case ARM::tSBC: + case ARM::tSTRBi: + case ARM::tSTRBr: + case ARM::tSTRHi: + case ARM::tSTRHr: + case ARM::tSTRi: + case ARM::tSTRr: + case ARM::tSTRspi: + case ARM::tSUBi3: + case ARM::tSUBi8: + case ARM::tSUBrr: + case ARM::tTST: + return true; +// there are some "conditionally deprecated" opcodes + case ARM::tADDspr: + return Instr->getOperand(2).getReg() != ARM::PC; + // ADD PC, SP and BLX PC were always unpredictable, + // now on top of it they're deprecated + case ARM::tADDrSP: + case ARM::tBX: + return Instr->getOperand(0).getReg() != ARM::PC; + case ARM::tBLXr: + return Instr->getOperand(BLXOperandIndex).getReg() != ARM::PC; + case ARM::tADDhirr: + return Instr->getOperand(0).getReg() != ARM::PC && + Instr->getOperand(2).getReg() != ARM::PC; + case ARM::tCMPhir: + case ARM::tMOVr: + return Instr->getOperand(0).getReg() != ARM::PC && + Instr->getOperand(1).getReg() != ARM::PC; + } +} + +} + +#endif diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 483802b130b5..d32bdbc58939 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -82,22 +82,11 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); } -static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - static bool isCSRestore(MachineInstr *MI, const ARMBaseInstrInfo &TII, const uint16_t *CSRegs) { // Integer spill area is handled with "pop". - if (MI->getOpcode() == ARM::LDMIA_RET || - MI->getOpcode() == ARM::t2LDMIA_RET || - MI->getOpcode() == ARM::LDMIA_UPD || - MI->getOpcode() == ARM::t2LDMIA_UPD || - MI->getOpcode() == ARM::VLDMDIA_UPD) { + if (isPopOpcode(MI->getOpcode())) { // The first two operands are predicates. The last two are // imp-def and imp-use of SP. Check everything in between. 
for (int i = 5, e = MI->getNumOperands(); i != e; ++i) @@ -115,20 +104,31 @@ static bool isCSRestore(MachineInstr *MI, return false; } -static void -emitSPUpdate(bool isARM, - MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - DebugLoc dl, const ARMBaseInstrInfo &TII, - int NumBytes, unsigned MIFlags = MachineInstr::NoFlags, - ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { +static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + const ARMBaseInstrInfo &TII, unsigned DestReg, + unsigned SrcReg, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) { if (isARM) - emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, Pred, PredReg, TII, MIFlags); else - emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, Pred, PredReg, TII, MIFlags); } +static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + const ARMBaseInstrInfo &TII, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) { + emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes, + MIFlags, Pred, PredReg); +} + void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -141,7 +141,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { assert(!AFI->isThumb1OnlyFunction() && "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); unsigned NumBytes = MFI->getStackSize(); const std::vector &CSI = MFI->getCalleeSavedInfo(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); @@ -174,6 +175,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned Reg = CSI[i].getReg(); int FI = CSI[i].getFrameIdx(); switch (Reg) { + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: @@ -181,73 +186,61 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::LR: if (Reg == FramePtr) FramePtrSpillFI = FI; - AFI->addGPRCalleeSavedArea1Frame(FI); GPRCS1Size += 4; break; case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: + case ARM::R12: if (Reg == FramePtr) FramePtrSpillFI = FI; - if (STI.isTargetIOS()) { - AFI->addGPRCalleeSavedArea2Frame(FI); + if (STI.isTargetIOS()) GPRCS2Size += 4; - } else { - AFI->addGPRCalleeSavedArea1Frame(FI); + else GPRCS1Size += 4; - } break; default: // This is a DPR. Exclude the aligned DPRCS2 spills. if (Reg == ARM::D8) D8SpillFI = FI; - if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) { - AFI->addDPRCalleeSavedAreaFrame(FI); + if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) DPRCSSize += 8; - } } } // Move past area 1. - if (GPRCS1Size > 0) MBBI++; - - // Set FP to point to the stack slot that contains the previous FP. - // For iOS, FP is R7, which has now been stored in spill area 1. - // Otherwise, if this is not iOS, all the callee-saved registers go - // into spill area 1, including the FP in R11. 
In either case, it is - // now safe to emit this assignment. - bool HasFP = hasFP(MF); - if (HasFP) { - unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0) - .setMIFlag(MachineInstr::FrameSetup); - AddDefaultCC(AddDefaultPred(MIB)); - } - - // Move past area 2. - if (GPRCS2Size > 0) MBBI++; + MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush; + if (GPRCS1Size > 0) + FramePtrPush = LastPush = MBBI++; // Determine starting offsets of spill areas. + bool HasFP = hasFP(MF); unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - if (HasFP) + int FramePtrOffsetInPush = 0; + if (HasFP) { + FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size; AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + } AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + // Move past area 2. + if (GPRCS2Size > 0) { + LastPush = MBBI++; + } + // Move past area 3. if (DPRCSSize > 0) { - MBBI++; + LastPush = MBBI++; // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) - MBBI++; + LastPush = MBBI++; } // Move past the aligned DPRCS2 area. @@ -263,8 +256,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { if (NumBytes) { // Adjust SP after all the callee-save spills. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, - MachineInstr::FrameSetup); + if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes)) { + if (LastPush == FramePtrPush) + FramePtrOffsetInPush += NumBytes; + } else + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); + if (HasFP && isARM) // Restore from fp only in ARM mode: e.g. sub sp, r7, #24 // Note it's not safe to do this in Thumb2 mode because it would have @@ -277,6 +275,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { AFI->setShouldRestoreSPFromFP(true); } + // Set FP to point to the stack slot that contains the previous FP. + // For iOS, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not iOS, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it + // is in area one and the adjustment needs to take place just after + // that push. + if (HasFP) + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII, + FramePtr, ARM::SP, FramePtrOffsetInPush, + MachineInstr::FrameSetup); + + if (STI.isTargetELF() && hasFP(MF)) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); @@ -357,7 +367,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, "This emitEpilogue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); int NumBytes = (int)MFI->getStackSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -371,11 +382,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); } else { // Unwind MBBI to point to first LDR / VLDRD. 
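Both emitPrologue and emitEpilogue now ask the target frame lowering for the stack alignment and hand it to getArgRegsSaveSize, so the r0-r3 argument save area is counted at the same alignment as the rest of the frame. A minimal sketch of the rounding that Align parameter is assumed to request (the helper name is mine; Align a power of two):

static unsigned alignedArgRegsSaveSize(unsigned Size, unsigned Align) {
  // Round Size up to the next multiple of Align.
  return (Size + Align - 1) & ~(Align - 1);
}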
- const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(); + const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); if (MBBI != MBB.begin()) { - do + do { --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); if (!isCSRestore(MBBI, TII, CSRegs)) ++MBBI; } @@ -419,8 +430,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, ARM::SP) .addReg(FramePtr)); } - } else if (NumBytes) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes)) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); // Increment past our save areas. if (AFI->getDPRCalleeSavedAreaSize()) { @@ -499,12 +510,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, FrameReg = ARM::SP; Offset += SPAdj; - if (AFI->isGPRCalleeSavedArea1Frame(FI)) - return Offset - AFI->getGPRCalleeSavedArea1Offset(); - else if (AFI->isGPRCalleeSavedArea2Frame(FI)) - return Offset - AFI->getGPRCalleeSavedArea2Offset(); - else if (AFI->isDPRCalleeSavedAreaFrame(FI)) - return Offset - AFI->getDPRCalleeSavedAreaOffset(); // SP can move around if there are allocas. We may also lose track of SP // when emergency spilling inside a non-reserved call frame setup. @@ -656,6 +661,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, unsigned RetOpcode = MI->getOpcode(); bool isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri); + bool isInterrupt = + RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; SmallVector Regs; unsigned i = CSI.size(); @@ -670,7 +677,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) continue; - if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) { + if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && + STI.hasV5TOps()) { Reg = ARM::PC; LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; // Fold the return instruction into the LDM. @@ -1197,7 +1205,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. 
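Restating the new pop-folding guard in emitPopInst above as a standalone predicate (a sketch only; no such helper exists in the tree):

static bool canFoldLRIntoPop(bool isTailCall, bool isVarArg, bool isInterrupt,
                             bool hasV5TOps) {
  // LR may be popped straight into PC only for a plain return: not a tail
  // call, not a vararg return, not an interrupt return (those must end with
  // "subs pc, lr, #N"), and only when the subtarget reports v5T ops.
  return !isTailCall && !isVarArg && !isInterrupt && hasV5TOps;
}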
- const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(); + const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; @@ -1224,6 +1232,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, case ARM::LR: LRSpilled = true; // Fallthrough + case ARM::R0: case ARM::R1: + case ARM::R2: case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: CS1Spilled = true; @@ -1238,6 +1248,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } switch (Reg) { + case ARM::R0: case ARM::R1: + case ARM::R2: case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: case ARM::LR: @@ -1293,8 +1305,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (!LRSpilled && CS1Spilled) { MRI.setPhysRegUsed(ARM::LR); NumGPRSpills++; - UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), - UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); + SmallVectorImpl::iterator LRPos; + LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), + (unsigned)ARM::LR); + if (LRPos != UnspilledCS1GPRs.end()) + UnspilledCS1GPRs.erase(LRPos); + ForceLRSpill = false; ExtraCSSpill = true; } diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index 1240169e84ed..c69d313fd9ce 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -44,10 +44,16 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) { MachineInstr *DefMI = LastMI; const MCInstrDesc &LastMCID = LastMI->getDesc(); + const TargetMachine &TM = + MI->getParent()->getParent()->getTarget(); + const ARMBaseInstrInfo &TII = + *static_cast(TM.getInstrInfo()); + // Skip over one non-VFP / NEON instruction. if (!LastMI->isBarrier() && // On A9, AGU and NEON/FPU are muxed. - !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) && + !(TII.getSubtarget().isLikeA9() && + (LastMI->mayLoad() || LastMI->mayStore())) && (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { @@ -58,7 +64,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (TII.isFpMLxInstruction(DefMI->getOpcode()) && (TII.canCauseFpMLxStall(MI->getOpcode()) || - hasRAWHazard(DefMI, MI, TRI))) { + hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) { // Try to schedule another instruction for the next 4 cycles. if (FpMLxStalls == 0) FpMLxStalls = 4; diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h index 98bfc4cf0cc5..e1dcec3d1cc8 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.h +++ b/lib/Target/ARM/ARMHazardRecognizer.h @@ -28,21 +28,14 @@ class MachineInstr; /// ARM preRA scheduler uses an unspecialized instance of the /// ScoreboardHazardRecognizer. 
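Earlier in this hunk, processFunctionBeforeCalleeSavedScan stops erasing the result of std::find unconditionally: erase(find(...)) is only correct when the element is guaranteed to be present, since erasing end() is undefined behaviour. The guarded pattern in isolation (a generic sketch, not LLVM code):

#include <algorithm>
#include <vector>

static void eraseIfPresent(std::vector<unsigned> &Regs, unsigned Reg) {
  std::vector<unsigned>::iterator I = std::find(Regs.begin(), Regs.end(), Reg);
  if (I != Regs.end()) // erasing end() would be undefined behaviour
    Regs.erase(I);
}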
class ARMHazardRecognizer : public ScoreboardHazardRecognizer { - const ARMBaseInstrInfo &TII; - const ARMBaseRegisterInfo &TRI; - const ARMSubtarget &STI; - MachineInstr *LastMI; unsigned FpMLxStalls; public: ARMHazardRecognizer(const InstrItineraryData *ItinData, - const ARMBaseInstrInfo &tii, - const ARMBaseRegisterInfo &tri, - const ARMSubtarget &sti, - const ScheduleDAG *DAG) : - ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii), - TRI(tri), STI(sti), LastMI(0) {} + const ScheduleDAG *DAG) + : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), + LastMI(0) {} virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void Reset(); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9e1782e1191c..87d15226947a 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -61,7 +61,6 @@ enum AddrMode2Type { class ARMDAGToDAGISel : public SelectionDAGISel { ARMBaseTargetMachine &TM; - const ARMBaseInstrInfo *TII; /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. @@ -71,7 +70,6 @@ public: explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), TM(tm), - TII(static_cast(TM.getInstrInfo())), Subtarget(&TM.getSubtarget()) { } @@ -132,6 +130,13 @@ public: return true; } + bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) { + const ConstantSDNode *CN = cast(N); + Pred = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); + Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32); + return true; + } + bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, @@ -177,6 +182,7 @@ public: SDValue &OffImm); bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); + bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); inline bool is_so_imm(unsigned Imm) const { return ARM_AM::getSOImmVal(Imm) != -1; @@ -240,21 +246,6 @@ private: /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); - /// SelectCMOVOp - Select CMOV instructions for ARM. - SDNode *SelectCMOVOp(SDNode *N); - SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, - SDValue InFlag); - SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, - SDValue InFlag); - SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, - SDValue InFlag); - SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, - SDValue InFlag); - // Select special operations if node forms integer ABS pattern SDNode *SelectABSOp(SDNode *N); @@ -262,7 +253,7 @@ private: SDNode *SelectConcatVector(SDNode *N); - SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); + SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, unsigned Op64); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
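The hazard recognizer above no longer caches TII/TRI/STI, so its constructor only needs the itinerary and the DAG; getHazardType recovers everything else from the instruction it is inspecting. The lookup chain it relies on, pulled out for reference (the free function itself is mine):

#include "ARMBaseInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

static const ARMBaseInstrInfo &instrInfoFor(const MachineInstr *MI) {
  // MachineInstr -> MachineBasicBlock -> MachineFunction -> TargetMachine.
  const TargetMachine &TM = MI->getParent()->getParent()->getTarget();
  return *static_cast<const ARMBaseInstrInfo *>(TM.getInstrInfo());
}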
@@ -364,7 +355,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { continue; // Check if the AND mask is an immediate of the form: 000.....1111111100 - unsigned TZ = CountTrailingZeros_32(And_imm); + unsigned TZ = countTrailingZeros(And_imm); if (TZ != 1 && TZ != 2) // Be conservative here. Shifter operands aren't always free. e.g. On // Swift, left shifter operand of 1 / 2 for free but others are not. @@ -402,12 +393,12 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { } // Now make the transformation. - Srl = CurDAG->getNode(ISD::SRL, Srl.getDebugLoc(), MVT::i32, + Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32, Srl.getOperand(0), CurDAG->getConstant(Srl_imm+TZ, MVT::i32)); - N1 = CurDAG->getNode(ISD::AND, N1.getDebugLoc(), MVT::i32, + N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32, Srl, CurDAG->getConstant(And_imm, MVT::i32)); - N1 = CurDAG->getNode(ISD::SHL, N1.getDebugLoc(), MVT::i32, + N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32, N1, CurDAG->getConstant(TZ, MVT::i32)); CurDAG->UpdateNodeOperands(N, N0, N1); } @@ -423,7 +414,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (!CheckVMLxHazard) return true; - if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() && + if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9() && !Subtarget->isSwift()) return true; @@ -434,6 +425,9 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (Use->getOpcode() == ISD::CopyToReg) return true; if (Use->isMachineOpcode()) { + const ARMBaseInstrInfo *TII = + static_cast(TM.getInstrInfo()); + const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode()); if (MCID.mayStore()) return true; @@ -533,7 +527,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } @@ -557,7 +552,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -703,7 +699,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } else if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { @@ -724,7 +721,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -901,7 +899,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } Offset = CurDAG->getRegister(0, MVT::i32); Opc = 
CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32); @@ -915,7 +914,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -960,7 +960,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } else if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { @@ -978,7 +979,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } ARM_AM::AddrOpc AddSub = ARM_AM::add; @@ -1202,7 +1204,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm) { if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } @@ -1219,7 +1222,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -1267,7 +1271,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } @@ -1297,7 +1302,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -1326,7 +1332,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); } OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -1403,6 +1410,34 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, return true; } +bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base, + SDValue &OffImm) { + // This *must* succeed since it's used for the irreplacable ldrex and strex + // instructions. 
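+  // (A note on the check below: the Thumb-2 LDREX/STREX offset is an 8-bit
+  // immediate scaled by 4, so only multiples of 4 in the range [0, 1020] can
+  // be folded; for any other address the whole computation stays in Base and
+  // the instruction keeps a zero offset.)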
+ Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + + if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N)) + return true; + + ConstantSDNode *RHS = dyn_cast(N.getOperand(1)); + if (!RHS) + return true; + + uint32_t RHSC = (int)RHS->getZExtValue(); + if (RHSC > 1020 || RHSC % 4 != 0) + return true; + + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy()); + } + + OffImm = CurDAG->getTargetConstant(RHSC / 4, MVT::i32); + return true; +} + //===--------------------------------------------------------------------===// /// getAL - Returns a ARMCC::AL immediate node. @@ -1468,14 +1503,14 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, AMOpc, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; - return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, + return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); } else { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; - return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, + return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); } } @@ -1524,7 +1559,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { SDValue Base = LD->getBasePtr(); SDValue Ops[]= { Base, Offset, getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), Chain }; - return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32, + return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, MVT::Other, Ops); } @@ -1533,7 +1568,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { /// \brief Form a GPRPair pseudo register from a pair of GPR regs. SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::GPRPairRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32); @@ -1544,7 +1579,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { /// \brief Form a D register from a pair of S registers. SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); @@ -1555,7 +1590,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) { /// \brief Form a quad register from a pair of D registers. SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); @@ -1565,7 +1600,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) { /// \brief Form 4 consecutive D registers from a pair of Q registers. 
SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); @@ -1576,7 +1611,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) { /// \brief Form 4 consecutive S registers. SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); @@ -1591,7 +1626,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, /// \brief Form 4 consecutive D registers. SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); @@ -1605,7 +1640,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, /// \brief Form 4 consecutive Q registers. SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3) { - DebugLoc dl = V0.getNode()->getDebugLoc(); + SDLoc dl(V0.getNode()); SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); @@ -1689,7 +1724,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue MemAddr, Align; unsigned AddrOpIdx = isUpdating ? 1 : 2; @@ -1821,7 +1856,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue MemAddr, Align; unsigned AddrOpIdx = isUpdating ? 1 : 2; @@ -1966,7 +2001,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, const uint16_t *DOpcodes, const uint16_t *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue MemAddr, Align; unsigned AddrOpIdx = isUpdating ? 
1 : 2; @@ -2084,7 +2119,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *Opcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue MemAddr, Align; if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) @@ -2166,7 +2201,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc) { assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); unsigned FirstTblReg = IsExt ? 2 : 1; @@ -2278,204 +2313,6 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, return NULL; } -SDNode *ARMDAGToDAGISel:: -SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { - SDValue CPTmp0; - SDValue CPTmp1; - if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) { - unsigned SOVal = cast(CPTmp1)->getZExtValue(); - unsigned SOShOp = ARM_AM::getSORegShOp(SOVal); - unsigned Opc = 0; - switch (SOShOp) { - case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break; - case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break; - case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break; - case ARM_AM::ror: Opc = ARM::t2MOVCCror; break; - default: - llvm_unreachable("Unknown so_reg opcode!"); - } - SDValue SOShImm = - CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32); - SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); - SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6); - } - return 0; -} - -SDNode *ARMDAGToDAGISel:: -SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { - SDValue CPTmp0; - SDValue CPTmp1; - SDValue CPTmp2; - if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) { - SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); - SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6); - } - - if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) { - SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); - SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7); - } - return 0; -} - -SDNode *ARMDAGToDAGISel:: -SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { - ConstantSDNode *T = dyn_cast(TrueVal); - if (!T) - return 0; - - unsigned Opc = 0; - unsigned TrueImm = T->getZExtValue(); - if (is_t2_so_imm(TrueImm)) { - Opc = ARM::t2MOVCCi; - } else if (TrueImm <= 0xffff) { - Opc = ARM::t2MOVCCi16; - } else if (is_t2_so_imm_not(TrueImm)) { - TrueImm = ~TrueImm; - Opc = ARM::t2MVNCCi; - } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) { - // Large immediate. 
- Opc = ARM::t2MOVCCi32imm; - } - - if (Opc) { - SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32); - SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); - SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); - } - - return 0; -} - -SDNode *ARMDAGToDAGISel:: -SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, - ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { - ConstantSDNode *T = dyn_cast(TrueVal); - if (!T) - return 0; - - unsigned Opc = 0; - unsigned TrueImm = T->getZExtValue(); - bool isSoImm = is_so_imm(TrueImm); - if (isSoImm) { - Opc = ARM::MOVCCi; - } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) { - Opc = ARM::MOVCCi16; - } else if (is_so_imm_not(TrueImm)) { - TrueImm = ~TrueImm; - Opc = ARM::MVNCCi; - } else if (TrueVal.getNode()->hasOneUse() && - (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) { - // Large immediate. - Opc = ARM::MOVCCi32imm; - } - - if (Opc) { - SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32); - SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); - SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); - } - - return 0; -} - -SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { - EVT VT = N->getValueType(0); - SDValue FalseVal = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue CC = N->getOperand(2); - SDValue CCR = N->getOperand(3); - SDValue InFlag = N->getOperand(4); - assert(CC.getOpcode() == ISD::Constant); - assert(CCR.getOpcode() == ISD::Register); - ARMCC::CondCodes CCVal = - (ARMCC::CondCodes)cast(CC)->getZExtValue(); - - if (!Subtarget->isThumb1Only() && VT == MVT::i32) { - // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) - // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) - // Pattern complexity = 18 cost = 1 size = 0 - if (Subtarget->isThumb()) { - SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal, - CCVal, CCR, InFlag); - if (!Res) - Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal, - ARMCC::getOppositeCondition(CCVal), CCR, InFlag); - if (Res) - return Res; - } else { - SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal, - CCVal, CCR, InFlag); - if (!Res) - Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal, - ARMCC::getOppositeCondition(CCVal), CCR, InFlag); - if (Res) - return Res; - } - - // Pattern: (ARMcmov:i32 GPR:i32:$false, - // (imm:i32)<>:$true, - // (imm:i32):$cc) - // Emits: (MOVCCi:i32 GPR:i32:$false, - // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc) - // Pattern complexity = 10 cost = 1 size = 0 - if (Subtarget->isThumb()) { - SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal, - CCVal, CCR, InFlag); - if (!Res) - Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal, - ARMCC::getOppositeCondition(CCVal), CCR, InFlag); - if (Res) - return Res; - } else { - SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal, - CCVal, CCR, InFlag); - if (!Res) - Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal, - ARMCC::getOppositeCondition(CCVal), CCR, InFlag); - if (Res) - return Res; - } - } - - // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) - // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) - // Pattern complexity = 6 cost = 1 size = 0 - // - // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) - // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) - // Pattern complexity = 6 cost = 11 size = 0 - // - // Also VMOVScc and VMOVDcc. 
- SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32); - SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag }; - unsigned Opc = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Illegal conditional move type!"); - case MVT::i32: - Opc = Subtarget->isThumb() - ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo) - : ARM::MOVCCr; - break; - case MVT::f32: - Opc = ARM::VMOVScc; - break; - case MVT::f64: - Opc = ARM::VMOVDcc; - break; - } - return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5); -} - /// Target-specific DAG combining for ISD::XOR. /// Target-independent combining lowers SELECT_CC nodes of the form /// select_cc setg[ge] X, 0, X, -X @@ -2524,30 +2361,45 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)); } -SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { +SDNode *ARMDAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8, + unsigned Op16,unsigned Op32, + unsigned Op64) { + // Mostly direct translation to the given operations, except that we preserve + // the AtomicOrdering for use later on. + AtomicSDNode *AN = cast(Node); + EVT VT = AN->getMemoryVT(); + + unsigned Op; + SDVTList VTs = CurDAG->getVTList(AN->getValueType(0), MVT::Other); + if (VT == MVT::i8) + Op = Op8; + else if (VT == MVT::i16) + Op = Op16; + else if (VT == MVT::i32) + Op = Op32; + else if (VT == MVT::i64) { + Op = Op64; + VTs = CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other); + } else + llvm_unreachable("Unexpected atomic operation"); + SmallVector Ops; - Ops.push_back(Node->getOperand(1)); // Ptr - Ops.push_back(Node->getOperand(2)); // Low part of Val1 - Ops.push_back(Node->getOperand(3)); // High part of Val1 - if (Opc == ARM::ATOMCMPXCHG6432) { - Ops.push_back(Node->getOperand(4)); // Low part of Val2 - Ops.push_back(Node->getOperand(5)); // High part of Val2 - } - Ops.push_back(Node->getOperand(0)); // Chain - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), - MVT::i32, MVT::i32, MVT::Other, - Ops); - cast(ResNode)->setMemRefs(MemOp, MemOp + 1); - return ResNode; + for (unsigned i = 1; i < AN->getNumOperands(); ++i) + Ops.push_back(AN->getOperand(i)); + + Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32)); + Ops.push_back(AN->getOperand(0)); // Chain moves to the end + + return CurDAG->SelectNodeTo(Node, Op, VTs, &Ops[0], Ops.size()); } SDNode *ARMDAGToDAGISel::Select(SDNode *N) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); - if (N->isMachineOpcode()) + if (N->isMachineOpcode()) { + N->setNodeId(-1); return NULL; // Already selected. + } switch (N->getOpcode()) { default: break; @@ -2587,7 +2439,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue CPIdx = CurDAG->getTargetConstantPool(ConstantInt::get( Type::getInt32Ty(*CurDAG->getContext()), Val), - TLI.getPointerTy()); + getTargetLowering()->getPointerTy()); SDNode *ResNode; if (Subtarget->isThumb1Only()) { @@ -2617,7 +2469,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ISD::FrameIndex: { // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. 
int FI = cast(N)->getIndex(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, + getTargetLowering()->getPointerTy()); if (Subtarget->isThumb1Only()) { SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; @@ -2838,8 +2691,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue(Chain.getNode(), Chain.getResNo())); return NULL; } - case ARMISD::CMOV: - return SelectCMOVOp(N); case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); @@ -3121,7 +2972,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_ldrexd: { SDValue MemAddr = N->getOperand(2); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue Chain = N->getOperand(0); bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2(); @@ -3179,7 +3030,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case Intrinsic::arm_strexd: { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue Chain = N->getOperand(0); SDValue Val0 = N->getOperand(2); SDValue Val1 = N->getOperand(3); @@ -3383,7 +3234,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VTBL1: { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); SmallVector Ops; @@ -3394,7 +3245,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops); } case ARMISD::VTBL2: { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); // Form a REG_SEQUENCE to force register allocation. @@ -3413,31 +3264,90 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ISD::CONCAT_VECTORS: return SelectConcatVector(N); - case ARMISD::ATOMOR64_DAG: - return SelectAtomic64(N, ARM::ATOMOR6432); - case ARMISD::ATOMXOR64_DAG: - return SelectAtomic64(N, ARM::ATOMXOR6432); - case ARMISD::ATOMADD64_DAG: - return SelectAtomic64(N, ARM::ATOMADD6432); - case ARMISD::ATOMSUB64_DAG: - return SelectAtomic64(N, ARM::ATOMSUB6432); - case ARMISD::ATOMNAND64_DAG: - return SelectAtomic64(N, ARM::ATOMNAND6432); - case ARMISD::ATOMAND64_DAG: - return SelectAtomic64(N, ARM::ATOMAND6432); - case ARMISD::ATOMSWAP64_DAG: - return SelectAtomic64(N, ARM::ATOMSWAP6432); - case ARMISD::ATOMCMPXCHG64_DAG: - return SelectAtomic64(N, ARM::ATOMCMPXCHG6432); - - case ARMISD::ATOMMIN64_DAG: - return SelectAtomic64(N, ARM::ATOMMIN6432); - case ARMISD::ATOMUMIN64_DAG: - return SelectAtomic64(N, ARM::ATOMUMIN6432); - case ARMISD::ATOMMAX64_DAG: - return SelectAtomic64(N, ARM::ATOMMAX6432); - case ARMISD::ATOMUMAX64_DAG: - return SelectAtomic64(N, ARM::ATOMUMAX6432); + case ISD::ATOMIC_LOAD: + if (cast(N)->getMemoryVT() == MVT::i64) + return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_LOAD_I64); + else + break; + + case ISD::ATOMIC_STORE: + if (cast(N)->getMemoryVT() == MVT::i64) + return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_STORE_I64); + else + break; + + case ISD::ATOMIC_LOAD_ADD: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_ADD_I8, + ARM::ATOMIC_LOAD_ADD_I16, + ARM::ATOMIC_LOAD_ADD_I32, + ARM::ATOMIC_LOAD_ADD_I64); + case ISD::ATOMIC_LOAD_SUB: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_SUB_I8, + ARM::ATOMIC_LOAD_SUB_I16, + ARM::ATOMIC_LOAD_SUB_I32, + ARM::ATOMIC_LOAD_SUB_I64); + case ISD::ATOMIC_LOAD_AND: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_AND_I8, + ARM::ATOMIC_LOAD_AND_I16, + ARM::ATOMIC_LOAD_AND_I32, + ARM::ATOMIC_LOAD_AND_I64); + case ISD::ATOMIC_LOAD_OR: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_OR_I8, + ARM::ATOMIC_LOAD_OR_I16, + ARM::ATOMIC_LOAD_OR_I32, + 
ARM::ATOMIC_LOAD_OR_I64); + case ISD::ATOMIC_LOAD_XOR: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_XOR_I8, + ARM::ATOMIC_LOAD_XOR_I16, + ARM::ATOMIC_LOAD_XOR_I32, + ARM::ATOMIC_LOAD_XOR_I64); + case ISD::ATOMIC_LOAD_NAND: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_NAND_I8, + ARM::ATOMIC_LOAD_NAND_I16, + ARM::ATOMIC_LOAD_NAND_I32, + ARM::ATOMIC_LOAD_NAND_I64); + case ISD::ATOMIC_LOAD_MIN: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_MIN_I8, + ARM::ATOMIC_LOAD_MIN_I16, + ARM::ATOMIC_LOAD_MIN_I32, + ARM::ATOMIC_LOAD_MIN_I64); + case ISD::ATOMIC_LOAD_MAX: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_MAX_I8, + ARM::ATOMIC_LOAD_MAX_I16, + ARM::ATOMIC_LOAD_MAX_I32, + ARM::ATOMIC_LOAD_MAX_I64); + case ISD::ATOMIC_LOAD_UMIN: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_UMIN_I8, + ARM::ATOMIC_LOAD_UMIN_I16, + ARM::ATOMIC_LOAD_UMIN_I32, + ARM::ATOMIC_LOAD_UMIN_I64); + case ISD::ATOMIC_LOAD_UMAX: + return SelectAtomic(N, + ARM::ATOMIC_LOAD_UMAX_I8, + ARM::ATOMIC_LOAD_UMAX_I16, + ARM::ATOMIC_LOAD_UMAX_I32, + ARM::ATOMIC_LOAD_UMAX_I64); + case ISD::ATOMIC_SWAP: + return SelectAtomic(N, + ARM::ATOMIC_SWAP_I8, + ARM::ATOMIC_SWAP_I16, + ARM::ATOMIC_SWAP_I32, + ARM::ATOMIC_SWAP_I64); + case ISD::ATOMIC_CMP_SWAP: + return SelectAtomic(N, + ARM::ATOMIC_CMP_SWAP_I8, + ARM::ATOMIC_CMP_SWAP_I16, + ARM::ATOMIC_CMP_SWAP_I32, + ARM::ATOMIC_CMP_SWAP_I64); } return SelectCode(N); @@ -3449,24 +3359,20 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ bool Changed = false; unsigned NumOps = N->getNumOperands(); - ExternalSymbolSDNode *S = dyn_cast( - N->getOperand(InlineAsm::Op_AsmString)); - StringRef AsmString = StringRef(S->getSymbol()); - // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint. // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs // respectively. Since there is no constraint to explicitly specify a - // reg pair, we search %H operand inside the asm string. If it is found, the - // transformation below enforces a GPRPair reg class for "%r" for 64-bit data. - if (AsmString.find(":H}") == StringRef::npos) - return NULL; + // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb, + // the 64-bit data may be referred by H, Q, R modifiers, so we still pack + // them into a GPRPair. - DebugLoc dl = N->getDebugLoc(); - SDValue Glue = N->getOperand(NumOps-1); + SDLoc dl(N); + SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) : SDValue(0,0); + SmallVector OpChanged; // Glue node will be appended late. - for(unsigned i = 0; i < NumOps -1; ++i) { + for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) { SDValue op = N->getOperand(i); AsmNodeOperands.push_back(op); @@ -3480,17 +3386,38 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ else continue; + // Immediate operands to inline asm in the SelectionDAG are modeled with + // two operands. The first is a constant of value InlineAsm::Kind_Imm, and + // the second is a constant with the value of the immediate. If we get here + // and we have a Kind_Imm, skip the next operand, and continue. + if (Kind == InlineAsm::Kind_Imm) { + SDValue op = N->getOperand(++i); + AsmNodeOperands.push_back(op); + continue; + } + + unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag); + if (NumRegs) + OpChanged.push_back(false); + + unsigned DefIdx = 0; + bool IsTiedToChangedOp = false; + // If it's a use that is tied with a previous def, it has no + // reg class constraint. 
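An aside on the SelectInlineAsm rewrite this loop belongs to: the 64-bit "r" operands it repacks into a GPRPair are the ones produced by inline assembly along the lines of the fragment below (illustrative source, not from LLVM). Previously the rewrite only fired when the asm string contained a ":H}" reference; it is now keyed off the operand needing two GPRs.

static unsigned long long load_exclusive64(unsigned long long *P) {
  unsigned long long V;
  // ldrexd needs an even/odd register pair; %0 names the low register of the
  // 64-bit operand and %H0 the high one, so the backend has to allocate the
  // operand as a GPRPair for the constraint to be satisfiable.
  asm volatile("ldrexd %0, %H0, [%1]" : "=&r"(V) : "r"(P));
  return V;
}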
+ if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) + IsTiedToChangedOp = OpChanged[DefIdx]; + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef && Kind != InlineAsm::Kind_RegDefEarlyClobber) continue; - unsigned RegNum = InlineAsm::getNumOperandRegisters(Flag); unsigned RC; bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); - if (!HasRC || RC != ARM::GPRRegClassID || RegNum != 2) + if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID)) + || NumRegs != 2) continue; - assert((i+2 < NumOps-1) && "Invalid number of operands in inline asm"); + assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); SDValue V0 = N->getOperand(i+1); SDValue V1 = N->getOperand(i+2); unsigned Reg0 = cast(V0)->getReg(); @@ -3551,8 +3478,12 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ Changed = true; if(PairedReg.getNode()) { + OpChanged[OpChanged.size() -1 ] = true; Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); - Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID); + if (IsTiedToChangedOp) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); + else + Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID); // Replace the current flag. AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( Flag, MVT::i32); @@ -3563,11 +3494,12 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ } } - AsmNodeOperands.push_back(Glue); + if (Glue.getNode()) + AsmNodeOperands.push_back(Glue); if (!Changed) return NULL; - SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(), + SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0], AsmNodeOperands.size()); New->setNodeId(-1); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e49cfc49854a..76a0a831f695 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" +#include using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -74,7 +75,7 @@ namespace { class ARMCCState : public CCState { public: ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, - const TargetMachine &TM, SmallVector &locs, + const TargetMachine &TM, SmallVectorImpl &locs, LLVMContext &C, ParmContext PC) : CCState(CC, isVarArg, MF, TM, locs, C) { assert(((PC == Call) || (PC == Prologue)) && @@ -174,9 +175,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - if (Subtarget->isTargetDarwin()) { + if (Subtarget->isTargetIOS()) { // Uses VFP for Thumb libfuncs if available. - if (Subtarget->isThumb() && Subtarget->hasVFP2()) { + if (Subtarget->isThumb() && Subtarget->hasVFP2() && + Subtarget->hasARMOps()) { // Single-precision floating-point arithmetic. setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); @@ -421,7 +423,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } // Use divmod compiler-rt calls for iOS 5.0 and later. 
- if (Subtarget->getTargetTriple().getOS() == Triple::IOS && + if (Subtarget->getTargetTriple().isiOS() && !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); @@ -452,6 +454,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f64, Custom); if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); @@ -564,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); - // Custom expand long extensions to vectors. - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - // NEON does not have single instruction CTPOP for vectors with element // types wider than 8-bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. @@ -681,6 +674,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); @@ -691,10 +686,36 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::SDIV, MVT::i32, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); } + + // FIXME: Also set divmod for SREM on EABI setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + // Register based DivRem for AEABI (RTABI 4.2) + if (Subtarget->isTargetAEABI()) { + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod"); + setLibcallName(RTLIB::UDIVREM_I8, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod"); + + setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); + + setOperationAction(ISD::SDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + } else { + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + } 
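A gloss on the AEABI divmod block above: __aeabi_idivmod and __aeabi_uidivmod return the quotient in r0 and the remainder in r1 (per the run-time ABI the comment cites), so marking SDIVREM/UDIVREM as Custom lets a quotient/remainder pair over the same operands become one libcall rather than two separate expansions. The helper below is purely illustrative:

struct DivModResult { int Quot; int Rem; };

static DivModResult divmod(int A, int B) {
  DivModResult R;
  R.Quot = A / B; // with SDIVREM legalised as Custom, these two operations
  R.Rem  = A % B; // should collapse into a single __aeabi_idivmod call
  return R;
}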
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); @@ -715,8 +736,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (!Subtarget->isTargetDarwin()) { // Non-Darwin platforms may return values in these registers via the // personality function. - setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); setExceptionPointerRegister(ARM::R0); setExceptionSelectorRegister(ARM::R1); } @@ -724,12 +743,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. - // FIXME: This should be checking for v6k, not just v6. - if (Subtarget->hasDataBarrier() || - (Subtarget->hasV6Ops() && !Subtarget->isThumb())) { - // membarrier needs custom lowering; the rest are legal and handled - // normally. - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { + // ATOMIC_FENCE needs custom lowering; the other 32-bit ones are legal and + // handled normally. + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Custom lowering for 64-bit ops setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); @@ -742,11 +759,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); - // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc. - setInsertFencesForAtomic(true); + // On v8, we have particularly efficient implementations of atomic fences + // if they can be combined with nearby atomic loads and stores. + if (!Subtarget->hasV8Ops()) { + // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc. + setInsertFencesForAtomic(true); + } + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); } else { + // If there's anything we can use as a barrier, go through custom lowering + // for ATOMIC_FENCE. + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, + Subtarget->hasAnyDataBarrier() ? Custom : Expand); + // Set them all for expansion, which will force libcalls. - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); @@ -843,6 +869,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand); } } + + // Combine sin / cos into one node or libcall if possible. + if (Subtarget->hasSinCos()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + if (Subtarget->getTargetTriple().getOS() == Triple::IOS) { + // For iOS, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret. + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } + } // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine @@ -882,6 +920,44 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setMinFunctionAlignment(Subtarget->isThumb() ? 
1 : 2); } +static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, + bool isThumb2, unsigned &LdrOpc, + unsigned &StrOpc) { + static const unsigned LoadBares[4][2] = {{ARM::LDREXB, ARM::t2LDREXB}, + {ARM::LDREXH, ARM::t2LDREXH}, + {ARM::LDREX, ARM::t2LDREX}, + {ARM::LDREXD, ARM::t2LDREXD}}; + static const unsigned LoadAcqs[4][2] = {{ARM::LDAEXB, ARM::t2LDAEXB}, + {ARM::LDAEXH, ARM::t2LDAEXH}, + {ARM::LDAEX, ARM::t2LDAEX}, + {ARM::LDAEXD, ARM::t2LDAEXD}}; + static const unsigned StoreBares[4][2] = {{ARM::STREXB, ARM::t2STREXB}, + {ARM::STREXH, ARM::t2STREXH}, + {ARM::STREX, ARM::t2STREX}, + {ARM::STREXD, ARM::t2STREXD}}; + static const unsigned StoreRels[4][2] = {{ARM::STLEXB, ARM::t2STLEXB}, + {ARM::STLEXH, ARM::t2STLEXH}, + {ARM::STLEX, ARM::t2STLEX}, + {ARM::STLEXD, ARM::t2STLEXD}}; + + const unsigned (*LoadOps)[2], (*StoreOps)[2]; + if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) + LoadOps = LoadAcqs; + else + LoadOps = LoadBares; + + if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) + StoreOps = StoreRels; + else + StoreOps = StoreBares; + + assert(isPowerOf2_32(Size) && Size <= 8 && + "unsupported size for atomic binary op!"); + + LdrOpc = LoadOps[Log2_32(Size)][isThumb2]; + StrOpc = StoreOps[Log2_32(Size)][isThumb2]; +} + // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, @@ -944,6 +1020,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMN: return "ARMISD::CMN"; @@ -983,7 +1060,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; - case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER"; case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; @@ -1042,6 +1118,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; + case ARMISD::VMAXNM: return "ARMISD::VMAX"; + case ARMISD::VMINNM: return "ARMISD::VMIN"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; @@ -1069,7 +1147,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -EVT ARMTargetLowering::getSetCCResultType(EVT VT) const { +EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(); return VT.changeVectorElementTypeToInteger(); } @@ -1233,7 +1311,7 @@ SDValue ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { @@ -1314,7 +1392,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue 
Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); @@ -1325,12 +1403,12 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, false, false, 0); } -void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, +void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, - SmallVector &MemOpChains, + SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, @@ -1357,10 +1435,10 @@ SDValue ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl &Outs = CLI.Outs; + SmallVectorImpl &OutVals = CLI.OutVals; + SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; @@ -1406,7 +1484,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -1496,7 +1575,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), - false, false, false, 0); + false, false, false, + DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } @@ -1705,17 +1785,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
- const uint32_t *Mask; - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARMBaseRegisterInfo *ARI = static_cast(TRI); - if (isThisReturn) - // For 'this' returns, use the R0-preserving mask - Mask = ARI->getThisReturnPreservedMask(CallConv); - else - Mask = ARI->getCallPreservedMask(CallConv); + if (!isTailCall) { + const uint32_t *Mask; + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const ARMBaseRegisterInfo *ARI = static_cast(TRI); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; + Mask = ARI->getCallPreservedMask(CallConv); + } + } else + Mask = ARI->getCallPreservedMask(CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -1729,7 +1818,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -1795,7 +1884,7 @@ ARMTargetLowering::HandleByVal( // else parameter would be splitted between registers and stack, // end register would be r4 in this case. unsigned ByValRegBegin = reg; - unsigned ByValRegEnd = (size < excess) ? reg + size/4 : ARM::R4; + unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4; State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); // Note, first register is allocated in the beginning of function already, // allocate remained amount of registers we need. @@ -1886,6 +1975,12 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isVarArg && !Outs.empty()) return false; + // Exception-handling functions need a special set of instructions to indicate + // a return to the hardware. Tail-calling another function would probably + // break this. + if (CallerF->hasFnAttribute("interrupt")) + return false; + // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) @@ -2014,12 +2109,45 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, isVarArg)); } +static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, + SDLoc DL, SelectionDAG &DAG) { + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *F = MF.getFunction(); + + StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); + + // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset + // version of the "preferred return address". These offsets affect the return + // instruction if this is a return from PL1 without hypervisor extensions. + // IRQ/FIQ: +4 "subs pc, lr, #4" + // SWI: 0 "subs pc, lr, #0" + // ABORT: +4 "subs pc, lr, #4" + // UNDEF: +4/+2 "subs pc, lr, #0" + // UNDEF varies depending on where the exception came from ARM or Thumb + // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 
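The offset table in the comment above reduces to a small mapping; a minimal standalone sketch, assuming the attribute value arrives as a plain string (the real code uses a StringRef):

    #include <stdexcept>
    #include <string>

    // "subs pc, lr, #N": pick N from the interrupt kind, mirroring the table above.
    static int64_t interruptLROffset(const std::string &IntKind) {
      if (IntKind.empty() || IntKind == "IRQ" || IntKind == "FIQ" ||
          IntKind == "ABORT")
        return 4;
      if (IntKind == "SWI" || IntKind == "UNDEF")
        return 0;
      throw std::invalid_argument("unsupported interrupt attribute: " + IntKind);
    }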
+ + int64_t LROffset; + if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || + IntKind == "ABORT") + LROffset = 4; + else if (IntKind == "SWI" || IntKind == "UNDEF") + LROffset = 0; + else + report_fatal_error("Unsupported interrupt attribute. If present, value " + "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); + + RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false)); + + return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, + RetOps.data(), RetOps.size()); +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; @@ -2099,6 +2227,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); + // CPUs which aren't M-class use a special sequence to return from + // exceptions (roughly, any instruction setting pc and cpsr simultaneously, + // though we use "subs pc, lr, #N"). + // + // M-class CPUs actually use a normal return sequence with a special + // (hardware-provided) value in LR, so the normal code path works. + if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && + !Subtarget->isMClass()) { + if (Subtarget->isThumb1Only()) + report_fatal_error("interrupt attribute is not supported in Thumb1"); + return LowerInterruptReturn(RetOps, dl, DAG); + } + return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps.data(), RetOps.size()); } @@ -2147,7 +2288,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { Copy = *Copy->use_begin(); if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) return false; - Chain = Copy->getOperand(0); + TCChain = Copy->getOperand(0); } else { return false; } @@ -2155,7 +2296,8 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { - if (UI->getOpcode() != ARMISD::RET_FLAG) + if (UI->getOpcode() != ARMISD::RET_FLAG && + UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } @@ -2186,7 +2328,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); ConstantPoolSDNode *CP = cast(Op); SDValue Res; if (CP->isMachineConstantPoolEntry()) @@ -2207,7 +2349,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); EVT PtrVT = getPointerTy(); const BlockAddress *BA = cast(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2236,7 +2378,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); EVT PtrVT = getPointerTy(); unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; MachineFunction &MF = DAG.getMachineFunction(); @@ -2279,7 +2421,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(); @@ -2349,7 +2491,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); @@ -2392,7 +2534,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2457,7 +2599,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", @@ -2473,7 +2615,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Val = DAG.getConstant(0, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), @@ -2482,7 +2624,7 @@ ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(0, MVT::i32)); } @@ -2491,7 +2633,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::arm_thread_pointer: { @@ -2527,7 +2669,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_neon_vmullu: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) ? ARMISD::VMULLs : ARMISD::VMULLu; - return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(), + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } } @@ -2536,19 +2678,33 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // FIXME: handle "fence singlethread" more efficiently. - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. 
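The barrier-domain choice made a little further down in the fence lowering (full-system for M-class, ISHST for release fences on Swift, inner-shareable otherwise) can be sketched on its own; the enums here are illustrative names, not the LLVM ones:

    // Which dmb domain the ATOMIC_FENCE lowering selects, under the assumptions above.
    enum BarrierDomain { InnerShareable, InnerShareableStores, FullSystem };
    enum MemOrdering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

    static BarrierDomain fenceDomain(bool IsMClass, bool IsSwift, MemOrdering Ord) {
      if (IsMClass)
        return FullSystem;            // M-profile only has the full-system barrier.
      if (IsSwift && Ord == Release)
        return InnerShareableStores;  // Swift's ISHST is strong enough for release.
      return InnerShareable;          // Default: dmb ish.
    }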
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && - "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, MVT::i32)); } - return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(ARM_MB::ISH, MVT::i32)); + ConstantSDNode *OrdN = cast(Op.getOperand(1)); + AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); + unsigned Domain = ARM_MB::ISH; + if (Subtarget->isMClass()) { + // Only a full system barrier exists in the M-class architectures. + Domain = ARM_MB::SY; + } else if (Subtarget->isSwift() && Ord == Release) { + // Swift happens to implement ISHST barriers in a way that's compatible with + // Release semantics but weaker than ISH so we'd be fools not to use + // it. Beware: other processors probably don't! + Domain = ARM_MB::ISHST; + } + + return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(Intrinsic::arm_dmb, MVT::i32), + DAG.getConstant(Domain, MVT::i32)); } static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, @@ -2559,7 +2715,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, // Just preserve the chain. return Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1; if (!isRead && (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) @@ -2584,7 +2740,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); @@ -2595,7 +2751,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -2630,6 +2786,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, void ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, unsigned InRegsParamRecordIdx, + unsigned ArgSize, unsigned &ArgRegsSize, unsigned &ArgRegsSaveSize) const { @@ -2648,7 +2805,29 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); ArgRegsSize = NumGPRs * 4; - ArgRegsSaveSize = (ArgRegsSize + Align - 1) & ~(Align - 1); + + // If parameter is split between stack and GPRs... + if (NumGPRs && Align == 8 && + (ArgRegsSize < ArgSize || + InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) { + // Add padding for part of param recovered from GPRs, so + // its last byte must be at address K*8 - 1. + // We need to do it, since remained (stack) part of parameter has + // stack alignment, and we need to "attach" "GPRs head" without gaps + // to it: + // Stack: + // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes... + // [ [padding] [GPRs head] ] [ Tail passed via stack .... 
+ // + ARMFunctionInfo *AFI = MF.getInfo(); + unsigned Padding = + ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) - + (ArgRegsSize + AFI->getArgRegsSaveSize()); + ArgRegsSaveSize = ArgRegsSize + Padding; + } else + // We don't need to extend regs save size for byval parameters if they + // are passed via GPRs only. + ArgRegsSaveSize = ArgRegsSize; } // The remaining GPRs hold either the beginning of variable-argument @@ -2661,11 +2840,12 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, // Return: The frame index registers were stored into. int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc dl, SDValue &Chain, + SDLoc dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, unsigned OffsetFromOrigArg, unsigned ArgOffset, + unsigned ArgSize, bool ForceMutable) const { // Currently, two use-cases possible: @@ -2690,12 +2870,13 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, lastRegToSaveIndex = REnd - ARM::R0; } else { firstRegToSaveIndex = CCInfo.getFirstUnallocated - (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); + (GPRArgRegs, array_lengthof(GPRArgRegs)); lastRegToSaveIndex = 4; } unsigned ArgRegsSize, ArgRegsSaveSize; - computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgRegsSize, ArgRegsSaveSize); + computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize, + ArgRegsSize, ArgRegsSaveSize); // Store any by-val regs to their spots on the stack so that they may be // loaded by deferencing the result of formal parameter pointer or va_next. @@ -2703,9 +2884,17 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, // was initialized, it can't be initialized again. if (ArgRegsSaveSize) { + unsigned Padding = ArgRegsSaveSize - ArgRegsSize; + + if (Padding) { + assert(AFI->getStoredByValParamsPadding() == 0 && + "The only parameter may be padded."); + AFI->setStoredByValParamsPadding(Padding); + } + int FrameIndex = MFI->CreateFixedObject( ArgRegsSaveSize, - ArgOffset + ArgRegsSaveSize - ArgRegsSize, + Padding + ArgOffset, false); SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); @@ -2737,13 +2926,14 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, return FrameIndex; } else // This will point to the next argument passed via stack. - return MFI->CreateFixedObject(4, ArgOffset, !ForceMutable); + return MFI->CreateFixedObject( + 4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable); } // Setup stack frame, the va_list pointer will start from. void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc dl, SDValue &Chain, + SDLoc dl, SDValue &Chain, unsigned ArgOffset, bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -2756,7 +2946,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, // argument passed via stack. 
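The padding arithmetic above is simply rounding the already-saved GPR area up to the 8-byte stack alignment; a standalone restatement under that assumption:

    // Bytes of padding needed so the "GPRs head" of a split byval argument ends on
    // an 8-byte boundary (Align is the stack alignment, 8 in the case handled above).
    static unsigned byvalGPRPadding(unsigned ArgRegsSize, unsigned AlreadySaved,
                                    unsigned Align) {
      unsigned Total = ArgRegsSize + AlreadySaved;
      unsigned RoundedUp = (Total + Align - 1) & ~(Align - 1);
      return RoundedUp - Total;
    }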
int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(), - 0, ArgOffset, ForceMutable); + 0, ArgOffset, 0, ForceMutable); AFI->setVarArgsFrameIndex(FrameIndex); } @@ -2766,7 +2956,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -2896,12 +3086,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, CurByValIndex, Ins[VA.getValNo()].PartOffset, VA.getLocMemOffset(), + Flags.getByValSize(), true /*force mutable frames*/); InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); CCInfo.nextInRegsParam(); } else { + unsigned FIOffset = VA.getLocMemOffset() + + AFI->getStoredByValParamsPadding(); int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, - VA.getLocMemOffset(), true); + FIOffset, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -2943,7 +3136,7 @@ static bool isFloatingPointZero(SDValue Op) { SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); if (!isLegalICmpImmediate(C)) { @@ -3001,7 +3194,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { SDValue Cmp; if (!isFloatingPointZero(RHS)) Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); @@ -3015,7 +3208,7 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, SDValue ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { unsigned Opc = Cmp.getOpcode(); - DebugLoc DL = Cmp.getDebugLoc(); + SDLoc DL(Cmp); if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); @@ -3035,7 +3228,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); SDValue SelectFalse = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Convert: // @@ -3083,6 +3276,61 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } +static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) { + if (CC == ISD::SETNE) + return ISD::SETEQ; + return ISD::getSetCCSwappedOperands(CC); +} + +static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, + bool &swpCmpOps, bool &swpVselOps) { + // Start by selecting the GE condition code for opcodes that return true for + // 'equality' + if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || + CC == ISD::SETULE) + CondCode = ARMCC::GE; + + // and GT for opcodes that return false for 'equality'. + else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || + CC == ISD::SETULT) + CondCode = ARMCC::GT; + + // Since we are constrained to GE/GT, if the opcode contains 'less', we need + // to swap the compare operands. 
+ if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || + CC == ISD::SETULT) + swpCmpOps = true; + + // Both GT and GE are ordered comparisons, and return false for 'unordered'. + // If we have an unordered opcode, we need to swap the operands to the VSEL + // instruction (effectively negating the condition). + // + // This also has the effect of swapping which one of 'less' or 'greater' + // returns true, so we also swap the compare operands. It also switches + // whether we return true for 'equality', so we compensate by picking the + // opposite condition code to our original choice. + if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || + CC == ISD::SETUGT) { + swpCmpOps = !swpCmpOps; + swpVselOps = !swpVselOps; + CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; + } + + // 'ordered' is 'anything but unordered', so use the VS condition code and + // swap the VSEL operands. + if (CC == ISD::SETO) { + CondCode = ARMCC::VS; + swpVselOps = true; + } + + // 'unordered or not equal' is 'anything but equal', so use the EQ condition + // code and swap the VSEL operands. + if (CC == ISD::SETUNE) { + CondCode = ARMCC::EQ; + swpVselOps = true; + } +} + SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); @@ -3090,18 +3338,69 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (LHS.getValueType() == MVT::i32) { + // Try to generate VSEL on ARMv8. + // The VSEL instruction can't use all the usual ARM condition + // codes: it only has two bits to select the condition code, so it's + // constrained to use only GE, GT, VS and EQ. + // + // To implement all the various ISD::SETXXX opcodes, we sometimes need to + // swap the operands of the previous compare instruction (effectively + // inverting the compare condition, swapping 'less' and 'greater') and + // sometimes need to swap the operands to the VSEL (which inverts the + // condition in the sense of firing whenever the previous condition didn't) + if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || + CondCode == ARMCC::VC || CondCode == ARMCC::NE) { + CC = getInverseCCForVSEL(CC); + std::swap(TrueVal, FalseVal); + } + } + SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); - return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp); + return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, + Cmp); } ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); + // Try to generate VSEL on ARMv8. + if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + // We can select VMAXNM/VMINNM from a compare followed by a select with the + // same operands, as follows: + // c = fcmp [ogt, olt, ugt, ult] a, b + // select c, a, b + // We only do this in unsafe-fp-math, because signed zeros and NaNs are + // handled differently than the original code sequence. 
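The NaN caveat in the comment above is easy to see in scalar code; a small demonstration of why "compare + select" and maxNum semantics (what VMAXNM implements) can disagree, using std::fmax as a stand-in for the instruction:

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = std::nan(""), b = 1.0;
      // "c = fcmp ogt a, b; select c, a, b": the comparison with NaN is false,
      // so the select yields b.
      double viaSelect = (a > b) ? a : b;         // 1.0
      double viaMaxNum = std::fmax(a, b);         // 1.0 as well
      // Swap the operands and the two forms diverge: the select now yields NaN,
      // while maxNum still returns the numeric operand.
      double viaSelectSwapped = (b > a) ? b : a;  // NaN
      double viaMaxNumSwapped = std::fmax(b, a);  // 1.0
      std::printf("%f %f %f %f\n", viaSelect, viaMaxNum,
                  viaSelectSwapped, viaMaxNumSwapped);
      return 0;
    }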
+ if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal && + RHS == FalseVal) { + if (CC == ISD::SETOGT || CC == ISD::SETUGT) + return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal); + if (CC == ISD::SETOLT || CC == ISD::SETULT) + return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal); + } + + bool swpCmpOps = false; + bool swpVselOps = false; + checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); + + if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || + CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { + if (swpCmpOps) + std::swap(LHS, RHS); + if (swpVselOps) + std::swap(TrueVal, FalseVal); + } + } + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); @@ -3145,7 +3444,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { return DAG.getConstant(0, MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) - return DAG.getLoad(MVT::i32, Op.getDebugLoc(), + return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); @@ -3163,7 +3462,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); - RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), @@ -3171,9 +3470,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, EVT PtrType = Ptr.getValueType(); unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); - SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(), + SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op), PtrType, Ptr, DAG.getConstant(4, PtrType)); - RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), NewPtr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), @@ -3193,7 +3492,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); bool LHSSeenZero = false; bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); @@ -3243,7 +3542,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (LHS.getValueType() == MVT::i32) { SDValue ARMcc; @@ -3284,7 +3583,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PTy = getPointerTy(); JumpTableSDNode *JT = cast(Table); @@ -3320,7 +3619,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getValueType().getVectorElementType() == MVT::i32) { if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) @@ -3342,7 +3641,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Opc; switch 
(Op.getOpcode()) { @@ -3360,7 +3659,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { if (VT.getVectorElementType() == MVT::f32) @@ -3396,7 +3695,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Opc; switch (Op.getOpcode()) { @@ -3417,7 +3716,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || @@ -3501,7 +3800,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ MFI->setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -3521,7 +3820,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin()) ? ARM::R7 : ARM::R11; @@ -3533,47 +3832,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } -/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec), -/// and size(DestVec) > 128-bits. -/// This is achieved by doing the one extension from the SrcVec, splitting the -/// result, extending these parts, and then concatenating these into the -/// destination. -static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) { - SDValue Op = N->getOperand(0); - EVT SrcVT = Op.getValueType(); - EVT DestVT = N->getValueType(0); - - assert(DestVT.getSizeInBits() > 128 && - "Custom sext/zext expansion needs >128-bit vector."); - // If this is a normal length extension, use the default expansion. 
- if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() && - SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits()) - return SDValue(); - - DebugLoc dl = N->getDebugLoc(); - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); - unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits(); - unsigned NumElts = SrcVT.getVectorNumElements(); - LLVMContext &Ctx = *DAG.getContext(); - SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi; - - EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), - NumElts); - EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), - NumElts/2); - EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize), - NumElts/2); - - Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op); - SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, - DAG.getIntPtrConstant(0)); - SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, - DAG.getIntPtrConstant(NumElts/2)); - ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo); - ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi); -} - /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -3581,7 +3839,7 @@ static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) { /// vectors), since the legalizer won't know what to do with that. static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue Op = N->getOperand(0); // This function is only supposed to be called for i64 types, either as the @@ -3618,7 +3876,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { /// not support i64 elements, so sometimes the zero vectors will need to be /// explicitly constructed. Regardless, use a canonical VMOV to create the /// zero vector. -static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); @@ -3634,7 +3892,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); @@ -3670,7 +3928,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); @@ -3703,7 +3961,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. 
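Since the FPSCR rounding-mode field sits in bits [23:22], the formula quoted above, ((FPSCR + (1 << 22)) >> 22) & 3, is just "add one to the field and wrap", which produces exactly the 0->1, 1->2, 2->3, 3->0 mapping; a quick standalone check:

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (uint32_t RMode = 0; RMode < 4; ++RMode) {
        uint32_t FPSCR = RMode << 22;  // other FPSCR bits left clear for clarity
        uint32_t FltRounds = ((FPSCR + (1u << 22)) >> 22) & 3;
        std::printf("RMode %u -> FLT_ROUNDS %u\n", RMode, FltRounds);
      }
      return 0;
    }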
- DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, DAG.getConstant(Intrinsic::arm_get_fpscr, MVT::i32)); @@ -3718,7 +3976,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); if (!ST->hasV6T2Ops()) return SDValue(); @@ -3742,7 +4000,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); @@ -3764,7 +4022,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { /// v4i16:Extracted = [k0 k1 k2 k3 ] static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue BitCounts = getCTPOP16BitCounts(N, DAG); if (VT.is64BitVector()) { @@ -3799,7 +4057,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { /// static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; @@ -3838,7 +4096,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); if (!VT.isVector()) return SDValue(); @@ -3873,7 +4131,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) @@ -3919,7 +4177,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue CC = Op.getOperand(2); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getOperand(1).getValueType().isFloatingPoint()) { switch (SetCCOpcode) { @@ -4177,18 +4435,26 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { - if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16()) + if (!ST->hasVFP3()) return SDValue(); + bool IsDouble = Op.getValueType() == MVT::f64; ConstantFPSDNode *CFP = cast(Op); - assert(Op.getValueType() == MVT::f32 && - "ConstantFP custom lowering should only occur for f32."); // Try splatting with a VMOV.f32... APFloat FPVal = CFP->getValueAPF(); - int ImmVal = ARM_AM::getFP32Imm(FPVal); + int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); + if (ImmVal != -1) { - DebugLoc DL = Op.getDebugLoc(); + if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { + // We have code in place to select a valid ConstantFP already, no need to + // do any mangling. + return Op; + } + + // It's a float and we are trying to use NEON operations where + // possible. Lower it to a splat followed by an extract. 
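For reference, the constants that ARM_AM::getFP32Imm / getFP64Imm accept above are the documented VMOV floating-point immediates, i.e. values of the form ±(n/16) * 2^r with n in [16, 31] and r in [-3, 4]; a brute-force standalone check of that range (an approximation for illustration, not a drop-in for the real encoder):

    #include <cmath>

    static bool fitsVMOVFPImm(double V) {
      if (V == 0.0 || std::isnan(V) || std::isinf(V))
        return false;
      double Mag = std::fabs(V);
      for (int r = -3; r <= 4; ++r)
        for (int n = 16; n <= 31; ++n)
          if (Mag == std::ldexp(n / 16.0, r))  // (n/16) * 2^r
            return true;
      return false;
    }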
+ SDLoc DL(Op); SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32); SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); @@ -4196,15 +4462,31 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, DAG.getConstant(0, MVT::i32)); } - // If that fails, try a VMOV.i32 + // The rest of our options are NEON only, make sure that's allowed before + // proceeding.. + if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) + return SDValue(); + EVT VMovVT; - unsigned iVal = FPVal.bitcastToAPInt().getZExtValue(); - SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false, - VMOVModImm); + uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); + + // It wouldn't really be worth bothering for doubles except for one very + // important value, which does happen to match: 0.0. So make sure we don't do + // anything stupid. + if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) + return SDValue(); + + // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). + SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT, + false, VMOVModImm); if (NewVal != SDValue()) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, NewVal); + if (IsDouble) + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); + + // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, @@ -4212,11 +4494,16 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } // Finally, try a VMVN.i32 - NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false, - VMVNModImm); + NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT, + false, VMVNModImm); if (NewVal != SDValue()) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); + + if (IsDouble) + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); + + // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, @@ -4475,7 +4762,7 @@ static bool isReverseMask(ArrayRef M, EVT VT) { // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, - const ARMSubtarget *ST, DebugLoc dl) { + const ARMSubtarget *ST, SDLoc dl) { uint64_t Val; if (!isa(N)) return SDValue(); @@ -4496,7 +4783,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { BuildVectorSDNode *BVN = cast(Op.getNode()); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); APInt SplatBits, SplatUndef; @@ -4580,7 +4867,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (ValueCounts.size() == 0) return DAG.getUNDEF(VT); - if (isOnlyLowElement) + // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. + // Keep going if we are hitting this case. 
+ if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); unsigned EltSize = VT.getVectorElementType().getSizeInBits(); @@ -4679,6 +4968,24 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. + if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, MVT::i32); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); + } + return Vec; + } + return SDValue(); } @@ -4686,7 +4993,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); @@ -4875,7 +5182,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); @@ -4955,7 +5262,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, // Check to see if we can use the VTBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SmallVector VTBLMask; for (ArrayRef::iterator @@ -4974,7 +5281,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, SelectionDAG &DAG) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue OpLHS = Op.getOperand(0); EVT VT = OpLHS.getValueType(); @@ -4992,7 +5299,7 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); @@ -5156,7 +5463,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue Vec = Op.getOperand(0); if (Op.getValueType() == MVT::i32 && Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); } @@ -5168,7 +5475,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { // two 64-bit vectors are concatenated to a 128-bit vector. 
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && "unexpected CONCAT_VECTORS"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Val = DAG.getUNDEF(MVT::v2f64); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -5291,7 +5598,7 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); - return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N); + return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } /// SkipLoadExtensionForVMULL - return a load of the original vector size that @@ -5304,7 +5611,7 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) - return DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), + return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); @@ -5312,7 +5619,7 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. - return DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), ExtendedTy, + return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); @@ -5341,7 +5648,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; - return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32, + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } // Construct a new BUILD_VECTOR with elements truncated to half the size. @@ -5358,7 +5665,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); } @@ -5430,7 +5737,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { } // Legalize to a VMULL instruction. - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue Op0; SDValue Op1 = SkipExtensionForVMULL(N1, DAG); if (!isMLA) { @@ -5460,7 +5767,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { } static SDValue -LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { +LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); @@ -5489,7 +5796,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { } static SDValue -LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) { +LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { SDValue N2; // Convert to float. 
// float4 yf = vcvt_f32_s32(vmovl_s16(y)); @@ -5530,7 +5837,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; @@ -5565,7 +5872,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; @@ -5649,12 +5956,76 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { } if (!ExtraOp) - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } +SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin()); + + // For iOS, we want to call an alternative entry point: __sincos_stret, + // return values are passed via sret. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Pair of floats / doubles used to pass the result. + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + + // Create stack object for sret. + const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy); + const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = (ArgVT == MVT::f64) + ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()), + false, false, false, false, 0, + CallingConv::C, /*isTaillCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed*/false, + Callee, Args, DAG, dl); + std::pair CallResult = LowerCallTo(CLI); + + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, + MachinePointerInfo(), false, false, false, 0); + + // Address of cos field. 
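At the C level, the call being assembled in this hunk amounts to roughly the following; sincos_stret_like is a hypothetical stand-in for the __sincos_stret entry point the lowering targets, with its shape assumed from how the code above passes an sret pointer and then loads the two fields:

    struct SinCosResult { double Sin, Cos; };

    // Assumed shape only: result written through the sret pointer.
    extern "C" void sincos_stret_like(SinCosResult *Ret, double X);

    static void sincosViaStret(double X, double &S, double &C) {
      SinCosResult Tmp;            // stack slot standing in for the sret object
      sincos_stret_like(&Tmp, X);  // one call computes both results
      S = Tmp.Sin;                 // load at offset 0
      C = Tmp.Cos;                 // load at offset sizeof(double), the "cos" field
    }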
+ SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize())); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + MachinePointerInfo(), false, false, false, 0); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, + LoadSin.getValue(0), LoadCos.getValue(0)); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast(Op)->getOrdering() <= Monotonic) @@ -5665,40 +6036,73 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { return SDValue(); } - static void ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl& Results, - SelectionDAG &DAG, unsigned NewOp) { - DebugLoc dl = Node->getDebugLoc(); + SelectionDAG &DAG) { + SDLoc dl(Node); assert (Node->getValueType(0) == MVT::i64 && "Only know how to expand i64 atomics"); + AtomicSDNode *AN = cast(Node); SmallVector Ops; Ops.push_back(Node->getOperand(0)); // Chain Ops.push_back(Node->getOperand(1)); // Ptr - // Low part of Val1 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0))); - // High part of Val1 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1))); - if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) { - // High part of Val1 + for(unsigned i=2; igetNumOperands(); i++) { + // Low part Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(3), DAG.getIntPtrConstant(0))); - // High part of Val2 + Node->getOperand(i), DAG.getIntPtrConstant(0))); + // High part Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(3), DAG.getIntPtrConstant(1))); + Node->getOperand(i), DAG.getIntPtrConstant(1))); } SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64, - cast(Node)->getMemOperand()); + DAG.getAtomic(Node->getOpcode(), dl, MVT::i64, Tys, Ops.data(), Ops.size(), + cast(Node)->getMemOperand(), AN->getOrdering(), + AN->getSynchScope()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) }; Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); Results.push_back(Result.getValue(2)); } +static void ReplaceREADCYCLECOUNTER(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + SDLoc DL(N); + SDValue Cycles32, OutChain; + + if (Subtarget->hasPerfMon()) { + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, , c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, MVT::i32), + DAG.getConstant(15, MVT::i32), + DAG.getConstant(0, MVT::i32), + DAG.getConstant(9, MVT::i32), + DAG.getConstant(13, MVT::i32), + DAG.getConstant(0, MVT::i32) + }; + + Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), &Ops[0], + array_lengthof(Ops)); + OutChain = Cycles32.getValue(1); + } else { + // Intrinsic is defined to return 0 on unsupported platforms. Technically + // there are older ARM CPUs that have implementation-specific ways of + // obtaining this information (FIXME!). 
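The mrc operands built above (coprocessor 15, opc1 0, CRn c9, CRm c13, opc2 0) read PMCCNTR, the performance-monitor cycle counter; written as GCC-style inline assembly the same read looks roughly like this sketch, which only works where the Performance Monitors extension is present and user access to the counter is enabled:

    #include <cstdint>

    static inline uint64_t readCycleCounter() {
    #if defined(__arm__)
      uint32_t Cycles32;
      __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(Cycles32));
      return Cycles32;  // high half zero, matching the BUILD_PAIR in the lowering
    #else
      return 0;         // the intrinsic is defined to return 0 when unsupported
    #endif
    }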
+ Cycles32 = DAG.getConstant(0, MVT::i32); + OutChain = DAG.getEntryNode(); + } + + + SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, + Cycles32, DAG.getConstant(0, MVT::i32)); + Results.push_back(Cycles64); + Results.push_back(OutChain); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); @@ -5753,6 +6157,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::SDIVREM: + case ISD::UDIVREM: return LowerDivRem(Op, DAG); } } @@ -5768,49 +6175,28 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: Res = ExpandBITCAST(N, DAG); break; - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - Res = ExpandVectorExtension(N, DAG); - break; case ISD::SRL: case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; - case ISD::ATOMIC_LOAD_ADD: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG); + case ISD::READCYCLECOUNTER: + ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_AND: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG); - return; case ISD::ATOMIC_LOAD_NAND: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG); - return; case ISD::ATOMIC_LOAD_OR: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG); - return; case ISD::ATOMIC_LOAD_SUB: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG); - return; case ISD::ATOMIC_LOAD_XOR: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG); - return; case ISD::ATOMIC_SWAP: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG); - return; case ISD::ATOMIC_CMP_SWAP: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG); - return; case ISD::ATOMIC_LOAD_MIN: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG); - return; case ISD::ATOMIC_LOAD_UMIN: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG); - return; case ISD::ATOMIC_LOAD_MAX: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG); - return; case ISD::ATOMIC_LOAD_UMAX: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG); + ReplaceATOMIC_OP_64(N, Results, DAG); return; } if (Res.getNode()) @@ -5830,6 +6216,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, unsigned oldval = MI->getOperand(2).getReg(); unsigned newval = MI->getOperand(3).getReg(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + AtomicOrdering Ord = static_cast(MI->getOperand(4).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -5845,21 +6232,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, } unsigned ldrOpc, strOpc; - switch (Size) { - default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); - case 1: - ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; - break; - case 2: - ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; - strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; - break; - case 4: - ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; - strOpc = isThumb2 ? 
ARM::t2STREX : ARM::STREX; - break; - } + getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc); MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -5939,6 +6312,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned dest = MI->getOperand(0).getReg(); unsigned ptr = MI->getOperand(1).getReg(); unsigned incr = MI->getOperand(2).getReg(); + AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -5946,24 +6320,11 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, if (isThumb2) { MRI.constrainRegClass(dest, &ARM::rGPRRegClass); MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + MRI.constrainRegClass(incr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc; - switch (Size) { - default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); - case 1: - ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; - break; - case 2: - ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; - strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; - break; - case 4: - ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; - strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; - break; - } + getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -6047,6 +6408,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, unsigned ptr = MI->getOperand(1).getReg(); unsigned incr = MI->getOperand(2).getReg(); unsigned oldval = dest; + AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -6054,24 +6416,20 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, if (isThumb2) { MRI.constrainRegClass(dest, &ARM::rGPRRegClass); MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + MRI.constrainRegClass(incr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc, extendOpc; + getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc); switch (Size) { - default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!"); case 1: - ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB; break; case 2: - ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; - strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH; break; case 4: - ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; - strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; extendOpc = 0; break; } @@ -6115,7 +6473,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, // Sign extend the value, if necessary. if (signExtend && extendOpc) { - oldval = MRI.createVirtualRegister(&ARM::GPRRegClass); + oldval = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass + : &ARM::GPRnopcRegClass); + if (!isThumb2) + MRI.constrainRegClass(dest, &ARM::GPRnopcRegClass); AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval) .addReg(dest) .addImm(0)); @@ -6153,7 +6514,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, unsigned Op1, unsigned Op2, bool NeedsCarry, bool IsCmpxchg, bool IsMinMax, ARMCC::CondCodes CC) const { - // This also handles ATOMIC_SWAP, indicated by Op1==0. 
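getExclusiveOperation, introduced earlier in this patch and now used in place of the per-size switches, reduces to the following selection; a standalone sketch with instruction mnemonics as strings instead of ARM:: opcode enums:

    #include <cassert>
    #include <string>

    enum MemOrdering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

    static void pickExclusivePair(unsigned Size, MemOrdering Ord,
                                  std::string &Ldr, std::string &Str) {
      static const char *LoadBare[]  = {"ldrexb", "ldrexh", "ldrex", "ldrexd"};
      static const char *LoadAcq[]   = {"ldaexb", "ldaexh", "ldaex", "ldaexd"};
      static const char *StoreBare[] = {"strexb", "strexh", "strex", "strexd"};
      static const char *StoreRel[]  = {"stlexb", "stlexh", "stlex", "stlexd"};

      assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
             "unsupported size for exclusive access");
      unsigned Idx = Size == 1 ? 0 : Size == 2 ? 1 : Size == 4 ? 2 : 3;

      // Acquire-flavoured loads and release-flavoured stores are selected
      // independently, so acq_rel and seq_cst get both.
      bool Acq = Ord == Acquire || Ord == AcquireRelease || Ord == SeqCst;
      bool Rel = Ord == Release || Ord == AcquireRelease || Ord == SeqCst;
      Ldr = (Acq ? LoadAcq : LoadBare)[Idx];
      Str = (Rel ? StoreRel : StoreBare)[Idx];
    }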
+ // This also handles ATOMIC_SWAP and ATOMIC_STORE, indicated by Op1==0. const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -6161,11 +6522,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction::iterator It = BB; ++It; + bool isStore = (MI->getOpcode() == ARM::ATOMIC_STORE_I64); + unsigned offset = (isStore ? -2 : 0); unsigned destlo = MI->getOperand(0).getReg(); unsigned desthi = MI->getOperand(1).getReg(); - unsigned ptr = MI->getOperand(2).getReg(); - unsigned vallo = MI->getOperand(3).getReg(); - unsigned valhi = MI->getOperand(4).getReg(); + unsigned ptr = MI->getOperand(offset+2).getReg(); + unsigned vallo = MI->getOperand(offset+3).getReg(); + unsigned valhi = MI->getOperand(offset+4).getReg(); + unsigned OrdIdx = offset + (IsCmpxchg ? 7 : 5); + AtomicOrdering Ord = static_cast(MI->getOperand(OrdIdx).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -6174,8 +6539,13 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MRI.constrainRegClass(destlo, &ARM::rGPRRegClass); MRI.constrainRegClass(desthi, &ARM::rGPRRegClass); MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + MRI.constrainRegClass(vallo, &ARM::rGPRRegClass); + MRI.constrainRegClass(valhi, &ARM::rGPRRegClass); } + unsigned ldrOpc, strOpc; + getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc); + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *contBB = 0, *cont2BB = 0; if (IsCmpxchg || IsMinMax) @@ -6215,21 +6585,23 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // fallthrough --> exitMBB BB = loopMBB; - // Load - if (isThumb2) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD)) - .addReg(destlo, RegState::Define) - .addReg(desthi, RegState::Define) - .addReg(ptr)); - } else { - unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD)) - .addReg(GPRPair0, RegState::Define).addReg(ptr)); - // Copy r2/r3 into dest. (This copy will normally be coalesced.) - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) - .addReg(GPRPair0, 0, ARM::gsub_0); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) - .addReg(GPRPair0, 0, ARM::gsub_1); + if (!isStore) { + // Load + if (isThumb2) { + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) + .addReg(destlo, RegState::Define) + .addReg(desthi, RegState::Define) + .addReg(ptr)); + } else { + unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) + .addReg(GPRPair0, RegState::Define).addReg(ptr)); + // Copy r2/r3 into dest. (This copy will normally be coalesced.) + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) + .addReg(GPRPair0, 0, ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) + .addReg(GPRPair0, 0, ARM::gsub_1); + } } unsigned StoreLo, StoreHi; @@ -6281,7 +6653,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // Store if (isThumb2) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess) + MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass); + MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass); + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) .addReg(StoreLo).addReg(StoreHi).addReg(ptr)); } else { // Marshal a pair... 
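// The hunks above funnel every exclusive-monitor code path through a single
// getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc) helper, so the
// AtomicOrdering operand now carried by the atomic pseudos can feed into the
// opcode choice (presumably selecting acquire/release exclusive variants as
// well as the plain LDREX/STREX forms).  A minimal standalone sketch of that
// dispatch pattern, assuming the usual "acquire -> ldaex*, release -> stlex*"
// mapping; the mnemonics and the mapping are illustrative, not copied from
// the in-tree helper:
#include <cstdio>

enum class Ordering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

struct ExclusivePair { const char *Ldr; const char *Str; };

static ExclusivePair pickExclusiveOps(unsigned Size, Ordering Ord) {
  // Index 0..3 corresponds to byte, halfword, word, doubleword accesses.
  static const ExclusivePair Bare[4] = {
      {"ldrexb", "strexb"}, {"ldrexh", "strexh"},
      {"ldrex",  "strex"},  {"ldrexd", "strexd"}};
  static const ExclusivePair AcqRel[4] = {
      {"ldaexb", "stlexb"}, {"ldaexh", "stlexh"},
      {"ldaex",  "stlex"},  {"ldaexd", "stlexd"}};
  unsigned Idx = Size == 1 ? 0 : Size == 2 ? 1 : Size == 4 ? 2 : 3;
  bool AcquiringLoad = Ord == Ordering::Acquire ||
                       Ord == Ordering::AcquireRelease ||
                       Ord == Ordering::SeqCst;
  bool ReleasingStore = Ord == Ordering::Release ||
                        Ord == Ordering::AcquireRelease ||
                        Ord == Ordering::SeqCst;
  // The load and store halves are chosen independently: a seq_cst cmpxchg,
  // for example, needs both an acquiring load and a releasing store.
  return {AcquiringLoad ? AcqRel[Idx].Ldr : Bare[Idx].Ldr,
          ReleasingStore ? AcqRel[Idx].Str : Bare[Idx].Str};
}

int main() {
  ExclusivePair P = pickExclusiveOps(8, Ordering::SeqCst);
  std::printf("%s / %s\n", P.Ldr, P.Str); // prints: ldaexd / stlexd
  return 0;
}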
@@ -6299,7 +6673,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, .addImm(ARM::gsub_1); // ...and store it - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess) + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) .addReg(StorePair).addReg(ptr)); } // Cmp+jump @@ -6320,6 +6694,51 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, return BB; } +MachineBasicBlock * +ARMTargetLowering::EmitAtomicLoad64(MachineInstr *MI, MachineBasicBlock *BB) const { + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + unsigned destlo = MI->getOperand(0).getReg(); + unsigned desthi = MI->getOperand(1).getReg(); + unsigned ptr = MI->getOperand(2).getReg(); + AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(destlo, &ARM::rGPRRegClass); + MRI.constrainRegClass(desthi, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + } + unsigned ldrOpc, strOpc; + getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc); + + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(ldrOpc)); + + if (isThumb2) { + MIB.addReg(destlo, RegState::Define) + .addReg(desthi, RegState::Define) + .addReg(ptr); + + } else { + unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + MIB.addReg(GPRPair0, RegState::Define).addReg(ptr); + + // Copy GPRPair0 into dest. (This copy will normally be coalesced.) + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), destlo) + .addReg(GPRPair0, 0, ARM::gsub_0); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), desthi) + .addReg(GPRPair0, 0, ARM::gsub_1); + } + AddDefaultPred(MIB); + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. void ARMTargetLowering:: @@ -6851,8 +7270,109 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { llvm_unreachable("Expecting a BB with two successors!"); } -MachineBasicBlock *ARMTargetLowering:: -EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { +/// Return the load opcode for a given load size. If load size >= 8, +/// neon opcode will be returned. +static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { + if (LdSize >= 8) + return LdSize == 16 ? ARM::VLD1q32wb_fixed + : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; + if (IsThumb1) + return LdSize == 4 ? ARM::tLDRi + : LdSize == 2 ? ARM::tLDRHi + : LdSize == 1 ? ARM::tLDRBi : 0; + if (IsThumb2) + return LdSize == 4 ? ARM::t2LDR_POST + : LdSize == 2 ? ARM::t2LDRH_POST + : LdSize == 1 ? ARM::t2LDRB_POST : 0; + return LdSize == 4 ? ARM::LDR_POST_IMM + : LdSize == 2 ? ARM::LDRH_POST + : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; +} + +/// Return the store opcode for a given store size. If store size >= 8, +/// neon opcode will be returned. +static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { + if (StSize >= 8) + return StSize == 16 ? ARM::VST1q32wb_fixed + : StSize == 8 ? ARM::VST1d32wb_fixed : 0; + if (IsThumb1) + return StSize == 4 ? ARM::tSTRi + : StSize == 2 ? ARM::tSTRHi + : StSize == 1 ? ARM::tSTRBi : 0; + if (IsThumb2) + return StSize == 4 ? ARM::t2STR_POST + : StSize == 2 ? ARM::t2STRH_POST + : StSize == 1 ? ARM::t2STRB_POST : 0; + return StSize == 4 ? 
ARM::STR_POST_IMM + : StSize == 2 ? ARM::STRH_POST + : StSize == 1 ? ARM::STRB_POST_IMM : 0; +} + +/// Emit a post-increment load operation with given size. The instructions +/// will be added to BB at Pos. +static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos, + const TargetInstrInfo *TII, DebugLoc dl, + unsigned LdSize, unsigned Data, unsigned AddrIn, + unsigned AddrOut, bool IsThumb1, bool IsThumb2) { + unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); + assert(LdOpc != 0 && "Should have a load opcode"); + if (LdSize >= 8) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addImm(0)); + } else if (IsThumb1) { + // load + update AddrIn + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrIn).addImm(0)); + MachineInstrBuilder MIB = + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(AddrIn).addImm(LdSize); + AddDefaultPred(MIB); + } else if (IsThumb2) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addImm(LdSize)); + } else { // arm + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addReg(0).addImm(LdSize)); + } +} + +/// Emit a post-increment store operation with given size. The instructions +/// will be added to BB at Pos. +static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos, + const TargetInstrInfo *TII, DebugLoc dl, + unsigned StSize, unsigned Data, unsigned AddrIn, + unsigned AddrOut, bool IsThumb1, bool IsThumb2) { + unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); + assert(StOpc != 0 && "Should have a store opcode"); + if (StSize >= 8) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(AddrIn).addImm(0).addReg(Data)); + } else if (IsThumb1) { + // store + update AddrIn + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data) + .addReg(AddrIn).addImm(0)); + MachineInstrBuilder MIB = + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(AddrIn).addImm(StSize); + AddDefaultPred(MIB); + } else if (IsThumb2) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data).addReg(AddrIn).addImm(StSize)); + } else { // arm + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data).addReg(AddrIn).addReg(0) + .addImm(StSize)); + } +} + +MachineBasicBlock * +ARMTargetLowering::EmitStructByval(MachineInstr *MI, + MachineBasicBlock *BB) const { // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. @@ -6867,23 +7387,18 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { unsigned Align = MI->getOperand(3).getImm(); DebugLoc dl = MI->getDebugLoc(); - bool isThumb2 = Subtarget->isThumb2(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned ldrOpc, strOpc, UnitSize = 0; + unsigned UnitSize = 0; + const TargetRegisterClass *TRC = 0; + const TargetRegisterClass *VecTRC = 0; - const TargetRegisterClass *TRC = isThumb2 ? 
- (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass; - const TargetRegisterClass *TRC_Vec = 0; + bool IsThumb1 = Subtarget->isThumb1Only(); + bool IsThumb2 = Subtarget->isThumb2(); if (Align & 1) { - ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; - strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; UnitSize = 1; } else if (Align & 2) { - ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST; - strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST; UnitSize = 2; } else { // Check whether we can use NEON instructions. @@ -6891,27 +7406,27 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { - if ((Align % 16 == 0) && SizeVal >= 16) { - ldrOpc = ARM::VLD1q32wb_fixed; - strOpc = ARM::VST1q32wb_fixed; + if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; - TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass; - } - else if ((Align % 8 == 0) && SizeVal >= 8) { - ldrOpc = ARM::VLD1d32wb_fixed; - strOpc = ARM::VST1d32wb_fixed; + else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; - TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass; - } } // Can't use NEON instructions. - if (UnitSize == 0) { - ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; - strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM; + if (UnitSize == 0) UnitSize = 4; - } } + // Select the correct opcode and register class for unit size load/store + bool IsNeon = UnitSize >= 8; + TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass + : (const TargetRegisterClass *)&ARM::GPRRegClass; + if (IsNeon) + VecTRC = UnitSize == 16 + ? (const TargetRegisterClass *)&ARM::DPairRegClass + : UnitSize == 8 + ? (const TargetRegisterClass *)&ARM::DPRRegClass + : 0; + unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; @@ -6922,34 +7437,13 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { - unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); - if (UnitSize >= 8) { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc), scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(destIn).addImm(0).addReg(scratch)); - } else if (isThumb2) { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc), scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addImm(UnitSize)); - } else { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc), scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0) - .addImm(UnitSize)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(UnitSize)); - } + unsigned scratch = MRI.createVirtualRegister(IsNeon ? 
VecTRC : TRC); + emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, + IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } @@ -6957,30 +7451,14 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // Handle the leftover bytes with LDRB and STRB. // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) - ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; - strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned scratch = MRI.createVirtualRegister(TRC); unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); - if (isThumb2) { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(1)); - } else { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn) - .addReg(0).addImm(1)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(1)); - } + unsigned scratch = MRI.createVirtualRegister(TRC); + emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, + IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } @@ -7021,17 +7499,16 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // Load an immediate to varEnd. unsigned varEnd = MRI.createVirtualRegister(TRC); - if (isThumb2) { - unsigned VReg1 = varEnd; + if (IsThumb2) { + unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) - VReg1 = MRI.createVirtualRegister(TRC); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1) - .addImm(LoopSize & 0xFFFF)); + Vtmp = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp) + .addImm(LoopSize & 0xFFFF)); if ((LoopSize & 0xFFFF0000) != 0) AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) - .addReg(VReg1) - .addImm(LoopSize >> 16)); + .addReg(Vtmp).addImm(LoopSize >> 16)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); @@ -7043,10 +7520,12 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp)) - .addReg(varEnd, RegState::Define) - .addConstantPoolIndex(Idx) - .addImm(0)); + if (IsThumb1) + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg( + varEnd, RegState::Define).addConstantPoolIndex(Idx)); + else + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg( + varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0)); } BB->addSuccessor(loopMBB); @@ -7075,39 +7554,30 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) - unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? 
TRC_Vec:TRC); - if (UnitSize >= 8) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) - .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0)); - - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) - .addReg(destPhi).addImm(0).addReg(scratch)); - } else if (isThumb2) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) - .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize)); - - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) - .addReg(scratch).addReg(destPhi) - .addImm(UnitSize)); - } else { - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) - .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0) - .addImm(UnitSize)); - - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) - .addReg(scratch).addReg(destPhi) - .addReg(0).addImm(UnitSize)); - } + unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, + IsThumb1, IsThumb2); + emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, + IsThumb1, IsThumb2); // Decrement loop variable by UnitSize. - MachineInstrBuilder MIB = BuildMI(BB, dl, - TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); - AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); - MIB->getOperand(5).setReg(ARM::CPSR); - MIB->getOperand(5).setIsDef(true); - - BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) - .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + if (IsThumb1) { + MachineInstrBuilder MIB = + BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(varPhi).addImm(UnitSize); + AddDefaultPred(MIB); + } else { + MachineInstrBuilder MIB = + BuildMI(*BB, BB->end(), dl, + TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); + AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); + MIB->getOperand(5).setReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + } + BuildMI(*BB, BB->end(), dl, + TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); // loopMBB can loop back to loopMBB or fall through to exitMBB. BB->addSuccessor(loopMBB); @@ -7116,34 +7586,19 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // Add epilogue to handle BytesLeft. BB = exitMBB; MachineInstr *StartOfExit = exitMBB->begin(); - ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; - strOpc = isThumb2 ? 
ARM::t2STRB_POST : ARM::STRB_POST_IMM; // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned scratch = MRI.createVirtualRegister(TRC); unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); - if (isThumb2) { - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); - - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addImm(1)); - } else { - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1)); - - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(1)); - } + unsigned scratch = MRI.createVirtualRegister(TRC); + emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, + IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } @@ -7293,46 +7748,49 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2); case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4); + case ARM::ATOMIC_LOAD_I64: + return EmitAtomicLoad64(MI, BB); - case ARM::ATOMADD6432: + case ARM::ATOMIC_LOAD_ADD_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr, isThumb2 ? ARM::t2ADCrr : ARM::ADCrr, /*NeedsCarry*/ true); - case ARM::ATOMSUB6432: + case ARM::ATOMIC_LOAD_SUB_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true); - case ARM::ATOMOR6432: + case ARM::ATOMIC_LOAD_OR_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); - case ARM::ATOMXOR6432: + case ARM::ATOMIC_LOAD_XOR_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr, isThumb2 ? ARM::t2EORrr : ARM::EORrr); - case ARM::ATOMAND6432: + case ARM::ATOMIC_LOAD_AND_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); - case ARM::ATOMSWAP6432: + case ARM::ATOMIC_STORE_I64: + case ARM::ATOMIC_SWAP_I64: return EmitAtomicBinary64(MI, BB, 0, 0, false); - case ARM::ATOMCMPXCHG6432: + case ARM::ATOMIC_CMP_SWAP_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ false, /*IsCmpxchg*/true); - case ARM::ATOMMIN6432: + case ARM::ATOMIC_LOAD_MIN_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, /*IsMinMax*/ true, ARMCC::LT); - case ARM::ATOMMAX6432: + case ARM::ATOMIC_LOAD_MAX_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, /*IsMinMax*/ true, ARMCC::GE); - case ARM::ATOMUMIN6432: + case ARM::ATOMIC_LOAD_UMIN_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? 
ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, /*IsMinMax*/ true, ARMCC::LO); - case ARM::ATOMUMAX6432: + case ARM::ATOMIC_LOAD_UMAX_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, @@ -7710,13 +8168,13 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, // Slct is now know to be the desired identity constant when CC is true. SDValue TrueVal = OtherOp; - SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT, + SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal); // Unless SwapSelectOps says CC should be false. if (SwapSelectOps) std::swap(TrueVal, FalseVal); - return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT, + return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal); } @@ -7823,9 +8281,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, llvm_unreachable("Invalid vector element type for padd optimization."); } - SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, &Ops[0], Ops.size()); - return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp); } static SDValue findMUL_LOHI(SDValue V) { @@ -7868,8 +8326,11 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, assert(AddcNode->getNumValues() == 2 && AddcNode->getValueType(0) == MVT::i32 && - AddcNode->getValueType(1) == MVT::Glue && - "Expect ADDC with two result values: i32, glue"); + "Expect ADDC with two result values. First: i32"); + + // Check that we have a glued ADDC node. + if (AddcNode->getValueType(1) != MVT::Glue) + return SDValue(); // Check that the ADDC adds the low result of the S/UMUL_LOHI. 
if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && @@ -7950,7 +8411,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, Ops.push_back(*LowAdd); Ops.push_back(*HiAdd); - SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(), + SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), DAG.getVTList(MVT::i32, MVT::i32), &Ops[0], Ops.size()); @@ -8038,6 +8499,13 @@ static SDValue PerformSUBCombine(SDNode *N, /// is faster than /// vadd d3, d0, d1 /// vmul d3, d3, d2 +// However, for (A + B) * (A + B), +// vadd d2, d0, d1 +// vmul d3, d0, d2 +// vmla d3, d1, d2 +// is slower than +// vadd d2, d0, d1 +// vmul d3, d2, d2 static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -8057,8 +8525,11 @@ static SDValue PerformVMULCombine(SDNode *N, std::swap(N0, N1); } + if (N0 == N1) + return SDValue(); + EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); return DAG.getNode(Opcode, DL, VT, @@ -8088,11 +8559,11 @@ static SDValue PerformMULCombine(SDNode *N, return SDValue(); int64_t MulAmt = C->getSExtValue(); - unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + unsigned ShiftAmt = countTrailingZeros(MulAmt); ShiftAmt = ShiftAmt & (32 - 1); SDValue V = N->getOperand(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue Res; MulAmt >>= ShiftAmt; @@ -8156,7 +8627,7 @@ static SDValue PerformANDCombine(SDNode *N, // Attempt to use immediate-form VBIC BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; @@ -8199,7 +8670,7 @@ static SDValue PerformORCombine(SDNode *N, const ARMSubtarget *Subtarget) { // Attempt to use immediate-form VORR BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; @@ -8248,22 +8719,29 @@ static SDValue PerformORCombine(SDNode *N, unsigned SplatBitSize; bool HasAnyUndefs; + APInt SplatBits0, SplatBits1; BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(1)); - APInt SplatBits0; + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); + // Ensure that the second operand of both ands are constants if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, - HasAnyUndefs) && !HasAnyUndefs) { - BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); - APInt SplatBits1; - if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, - HasAnyUndefs) && !HasAnyUndefs && - SplatBits0 == ~SplatBits1) { - // Canonicalize the vector type to make instruction selection simpler. - EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; - SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, - N0->getOperand(1), N0->getOperand(0), - N1->getOperand(0)); - return DAG.getNode(ISD::BITCAST, dl, VT, Result); - } + HasAnyUndefs) && !HasAnyUndefs) { + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && !HasAnyUndefs) { + // Ensure that the bit width of the constants are the same and that + // the splat arguments are logical inverses as per the pattern we + // are trying to simplify. + if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && + SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection + // simpler. + EVT CanonicalVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; + SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, + N0->getOperand(1), + N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } } } @@ -8274,7 +8752,7 @@ static SDValue PerformORCombine(SDNode *N, if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) return SDValue(); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // 1) or (and A, mask), val => ARMbfi A, val, mask // iff (val & mask) == val // @@ -8309,7 +8787,7 @@ static SDValue PerformORCombine(SDNode *N, return SDValue(); if (ARM::isBitFieldInvertedMask(Mask)) { - Val >>= CountTrailingZeros_32(~Mask); + Val >>= countTrailingZeros(~Mask); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, DAG.getConstant(Val, MVT::i32), @@ -8336,7 +8814,7 @@ static SDValue PerformORCombine(SDNode *N, (Mask == 0xffff || Mask == 0xffff0000)) return SDValue(); // 2a - unsigned amt = CountTrailingZeros_32(Mask2); + unsigned amt = countTrailingZeros(Mask2); Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), DAG.getConstant(amt, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, @@ -8352,7 +8830,7 @@ static SDValue PerformORCombine(SDNode *N, (Mask2 == 0xffff || Mask2 == 0xffff0000)) return SDValue(); // 2b - unsigned lsb = CountTrailingZeros_32(Mask); + unsigned lsb = countTrailingZeros(Mask); Res = DAG.getNode(ISD::SRL, DL, VT, N00, DAG.getConstant(lsb, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, @@ -8370,7 +8848,7 @@ static SDValue PerformORCombine(SDNode *N, // where lsb(mask) == #shamt and masked bits of B are known zero. SDValue ShAmt = N00.getOperand(1); unsigned ShAmtC = cast(ShAmt)->getZExtValue(); - unsigned LSB = CountTrailingZeros_32(Mask); + unsigned LSB = countTrailingZeros(Mask); if (ShAmtC != LSB) return SDValue(); @@ -8413,12 +8891,12 @@ static SDValue PerformBFICombine(SDNode *N, if (!N11C) return SDValue(); unsigned InvMask = cast(N->getOperand(2))->getZExtValue(); - unsigned LSB = CountTrailingZeros_32(~InvMask); - unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB; + unsigned LSB = countTrailingZeros(~InvMask); + unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; unsigned Mask = (1 << Width)-1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) - return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0), + return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); } @@ -8444,7 +8922,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, LoadSDNode *LD = cast(InNode); SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = LD->getDebugLoc(); + SDLoc DL(LD); SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->isVolatile(), @@ -8481,7 +8959,7 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { if (Op0.getOpcode() == ARMISD::VMOVRRD && Op0.getNode() == Op1.getNode() && Op0.getResNo() == 0 && Op1.getResNo() == 1) - return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0.getOperand(0)); return SDValue(); } @@ -8523,7 +9001,7 @@ static SDValue PerformSTORECombine(SDNode *N, NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - DebugLoc DL = St->getDebugLoc(); + SDLoc DL(St); SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] 
= i * SizeRatio; @@ -8584,7 +9062,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = St->getDebugLoc(); + SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, StVal.getNode()->getOperand(0), BasePtr, @@ -8606,14 +9084,14 @@ static SDValue PerformSTORECombine(SDNode *N, // Bitcast an i64 store extracted from a vector to f64. // Otherwise, the i64 value will be legalized to a pair of i32 values. SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = StVal.getDebugLoc(); + SDLoc dl(StVal); SDValue IntVec = StVal.getOperand(0); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, IntVec.getValueType().getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Vec, StVal.getOperand(1)); - dl = N->getDebugLoc(); + dl = SDLoc(N); SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); @@ -8659,7 +9137,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, EVT VT = N->getValueType(0); if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) return SDValue(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SmallVector Ops; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { @@ -8673,6 +9151,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, dl, VT, BV); } +/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. +static SDValue +PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. + // At that time, we may have inserted bitcasts from integer to float. + // If these bitcasts have survived DAGCombine, change the lowering of this + // BUILD_VECTOR in something more vector friendly, i.e., that does not + // force to use floating point types. + + // Make sure we can change the type of the vector. + // This is possible iff: + // 1. The vector is only used in a bitcast to a integer type. I.e., + // 1.1. Vector is used only once. + // 1.2. Use is a bit convert to an integer type. + // 2. The size of its operands are 32-bits (64-bits are not legal). + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + + // Check 1.1. and 2. + if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) + return SDValue(); + + // By construction, the input type must be float. + assert(EltVT == MVT::f32 && "Unexpected type!"); + + // Check 1.2. + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() != ISD::BITCAST || + Use->getValueType(0).isFloatingPoint()) + return SDValue(); + + // Check profitability. + // Model is, if more than half of the relevant operands are bitcast from + // i32, turn the build_vector into a sequence of insert_vector_elt. + // Relevant operands are everything that is not statically + // (i.e., at compile time) bitcasted. + unsigned NumOfBitCastedElts = 0; + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumOfRelevantElts = NumElts; + for (unsigned Idx = 0; Idx < NumElts; ++Idx) { + SDValue Elt = N->getOperand(Idx); + if (Elt->getOpcode() == ISD::BITCAST) { + // Assume only bit cast to i32 will go away. 
+ if (Elt->getOperand(0).getValueType() == MVT::i32) + ++NumOfBitCastedElts; + } else if (Elt.getOpcode() == ISD::UNDEF || isa(Elt)) + // Constants are statically casted, thus do not count them as + // relevant operands. + --NumOfRelevantElts; + } + + // Check if more than half of the elements require a non-free bitcast. + if (NumOfBitCastedElts <= NumOfRelevantElts / 2) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + // Create the new vector type. + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + // Check if the type is legal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VecVT)) + return SDValue(); + + // Combine: + // ARMISD::BUILD_VECTOR E1, E2, ..., EN. + // => BITCAST INSERT_VECTOR_ELT + // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), + // (BITCAST EN), N. + SDValue Vec = DAG.getUNDEF(VecVT); + SDLoc dl(N); + for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { + SDValue V = N->getOperand(Idx); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (V.getOpcode() == ISD::BITCAST && + V->getOperand(0).getValueType() == MVT::i32) + // Fold obvious case. + V = V.getOperand(0); + else { + V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(V.getNode()); + } + SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); + } + Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + return Vec; +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -8686,7 +9256,7 @@ static SDValue PerformInsertEltCombine(SDNode *N, return SDValue(); SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VT.getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); @@ -8732,7 +9302,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { !TLI.isTypeLegal(Concat1Op1.getValueType())) return SDValue(); - SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, + SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Op0.getOperand(0), Op1.getOperand(0)); // Translate the shuffle mask. 
SmallVector NewMask; @@ -8748,7 +9318,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { NewElt = HalfElts + MaskElt - NumElts; NewMask.push_back(NewElt); } - return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat, + return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, DAG.getUNDEF(VT), NewMask.data()); } @@ -8865,7 +9435,7 @@ static SDValue CombineBaseUpdate(SDNode *N, Ops.push_back(N->getOperand(i)); } MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys, + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops.data(), Ops.size(), MemInt->getMemoryVT(), MemInt->getMemOperand()); @@ -8939,7 +9509,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast(VLD); - SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys, + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2, VLDMemInt->getMemoryVT(), VLDMemInt->getMemOperand()); @@ -8994,7 +9564,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, if (EltSize > VT.getVectorElementType().getSizeInBits()) return SDValue(); - return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); + return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } // isConstVecPow2 - Return true if each vector element is a power of 2, all @@ -9051,12 +9621,27 @@ static SDValue PerformVCVTCombine(SDNode *N, !isConstVecPow2(ConstVec, isSigned, C)) return SDValue(); + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + // These instructions only exist converting from f32 to i32. We can handle + // smaller integers by generating an extra truncate, but larger ones would + // be lossy. + return SDValue(); + } + unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), - N->getValueType(0), - DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, - DAG.getConstant(Log2_64(C), MVT::i32)); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), + NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, + DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, + DAG.getConstant(Log2_64(C), MVT::i32)); + + if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv); + + return FixConv; } /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) @@ -9087,12 +9672,28 @@ static SDValue PerformVDIVCombine(SDNode *N, !isConstVecPow2(ConstVec, isSigned, C)) return SDValue(); + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + // These instructions only exist converting from i32 to f32. We can handle + // smaller integers by generating an extra extend, but larger ones would + // be lossy. + return SDValue(); + } + + SDValue ConvInput = Op.getOperand(0); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + ConvInput = DAG.getNode(isSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, + SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, + ConvInput); + unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : Intrinsic::arm_neon_vcvtfxu2fp; - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), Op.getValueType(), DAG.getConstant(IntrinsicOpcode, MVT::i32), - Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32)); + ConvInput, DAG.getConstant(Log2_64(C), MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate @@ -9273,7 +9874,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { VShiftOpc = ARMISD::VQRSHRNsu; break; } - return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); } @@ -9290,7 +9891,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } - return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), DAG.getConstant(Cnt, MVT::i32)); } @@ -9321,7 +9922,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && DAG.MaskedValueIsZero(N0.getOperand(0), APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1); + return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); } } @@ -9338,7 +9939,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) - return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0), + return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0), DAG.getConstant(Cnt, MVT::i32)); break; @@ -9347,7 +9948,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHRs : ARMISD::VSHRu); - return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0), + return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0), DAG.getConstant(Cnt, MVT::i32)); } } @@ -9387,7 +9988,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, Opc = ARMISD::VGETLANEu; break; } - return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane); + return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); } } @@ -9476,7 +10077,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, if (!Opcode) return SDValue(); - return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS); + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 
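// The PerformVCVTCombine/PerformVDIVCombine hunks above rely on the identity
// that a float-to-fixed conversion with C fractional bits is exactly a
// multiply by 2^C followed by truncation toward zero (and the inverse for the
// fixed-to-float direction), and they now guard the fold so it only fires for
// the f32 <-> 32-bit-lane forms the NEON instructions actually provide,
// inserting an extra truncate/extend for narrower integer lanes.  A scalar
// model of that identity, as an illustration only (the toFixed name is made
// up here):
#include <cmath>
#include <cstdint>
#include <cstdio>

// Model of the folded pair (fptosi (fmul x, 2^FracBits)): scale, then
// truncate toward zero, which is precisely a fixed-point conversion.
static int32_t toFixed(float X, unsigned FracBits) {
  return static_cast<int32_t>(X * std::ldexp(1.0f, static_cast<int>(FracBits)));
}

int main() {
  std::printf("%d\n", toFixed(2.75f, 3)); // 2.75 * 2^3 = 22
  std::printf("%d\n", toFixed(-1.5f, 4)); // -1.5 * 2^4 = -24
  return 0;
}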
@@ -9488,7 +10089,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { return SDValue(); EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); SDValue FalseVal = N->getOperand(0); @@ -9578,6 +10179,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: return CombineBaseUpdate(N, DCI); + case ARMISD::BUILD_VECTOR: + return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { @@ -9702,6 +10305,21 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + + static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; @@ -10101,9 +10719,19 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); + unsigned BitWidth = KnownOne.getBitWidth(); + KnownZero = KnownOne = APInt(BitWidth, 0); switch (Op.getOpcode()) { default: break; + case ARMISD::ADDC: + case ARMISD::ADDE: + case ARMISD::SUBC: + case ARMISD::SUBE: + // These nodes' second result is a boolean + if (Op.getResNo() == 0) + break; + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. 
DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); @@ -10217,7 +10845,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight( typedef std::pair RCPair; RCPair ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + MVT VT) const { if (Constraint.size() == 1) { // GCC ARM Constraint Letters switch (Constraint[0]) { @@ -10232,6 +10860,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'r': return RCPair(0U, &ARM::GPRRegClass); case 'w': + if (VT == MVT::Other) + break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) @@ -10240,6 +10870,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return RCPair(0U, &ARM::QPRRegClass); break; case 'x': + if (VT == MVT::Other) + break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) @@ -10426,6 +11058,54 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; + case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; + case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; + case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; + } + + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Op->getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy()); + + Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL); + + SDLoc dl(Op); + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true, + 0, getLibcallCallingConv(LC), /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Callee, Args, DAG, dl); + std::pair CallInfo = LowerCallTo(CLI); + + return CallInfo.first; +} + bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. 
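// The LowerDivRem hunk above turns ISD::SDIVREM/UDIVREM into one AEABI
// runtime call whose return type is built as StructType::get(Ty, Ty), i.e.
// the helper hands back quotient and remainder together, which is why a
// div+rem pair can be served by a single call.  Below is a standalone sketch
// of the width/signedness selection and of the pair-returning contract; the
// libcall names follow the AEABI convention, but the exact mapping for the
// narrow integer cases is an assumption here, not a quote of the RTLIB
// tables:
#include <cstdint>
#include <cstdio>

struct DivRem64 { int64_t Quot; int64_t Rem; };

// Mirror of the switch on VT.getSimpleVT().SimpleTy in LowerDivRem: pick a
// runtime helper from the operand width and signedness.
static const char *pickDivRemLibcall(unsigned Bits, bool IsSigned) {
  switch (Bits) {
  case 8:
  case 16:
  case 32: return IsSigned ? "__aeabi_idivmod" : "__aeabi_uidivmod";
  case 64: return IsSigned ? "__aeabi_ldivmod" : "__aeabi_uldivmod";
  default: return nullptr;
  }
}

// Reference semantics of the combined call: one division yields both results.
static DivRem64 divRem(int64_t N, int64_t D) { return {N / D, N % D}; }

int main() {
  DivRem64 R = divRem(-7, 3);
  std::printf("%s -> quot=%lld rem=%lld\n", pickDivRemLibcall(64, true),
              static_cast<long long>(R.Quot), static_cast<long long>(R.Rem));
  return 0;
}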
@@ -10434,17 +11114,15 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool ARM::isBitFieldInvertedMask(unsigned v) { if (v == 0xffffffff) - return 0; + return false; + // there can be 1's on either or both "outsides", all the "inside" // bits must be 0's - unsigned int lsb = 0, msb = 31; - while (v & (1 << msb)) --msb; - while (v & (1 << lsb)) ++lsb; - for (unsigned int i = lsb; i <= msb; ++i) { - if (v & (1 << i)) - return 0; - } - return 1; + unsigned TO = CountTrailingOnes_32(v); + unsigned LO = CountLeadingOnes_32(v); + v = (v >> TO) << TO; + v = (v << LO) >> LO; + return v == 0; } /// isFPImmLegal - Returns true if the target can instruction select the @@ -10513,6 +11191,30 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.writeMem = true; return true; } + case Intrinsic::arm_ldrex: { + PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_strex: { + PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } case Intrinsic::arm_strexd: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 426010e295d7..90facddeb02b 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -52,6 +52,7 @@ namespace llvm { BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). RET_FLAG, // Return with a flag operand. + INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. PIC_ADD, // Add with a PC operand and a PIC label. @@ -94,7 +95,6 @@ namespace llvm { DYN_ALLOC, // Dynamic allocation on the stack. - MEMBARRIER, // Memory barrier (DMB) MEMBARRIER_MCR, // Memory barrier (MCR) PRELOAD, // Preload @@ -186,6 +186,8 @@ namespace llvm { // Floating-point max and min: FMAX, FMIN, + VMAXNM, + VMINNM, // Bit-field insert BFI, @@ -222,21 +224,7 @@ namespace llvm { VST4_UPD, VST2LN_UPD, VST3LN_UPD, - VST4LN_UPD, - - // 64-bit atomic ops (value split into two registers) - ATOMADD64_DAG, - ATOMSUB64_DAG, - ATOMOR64_DAG, - ATOMXOR64_DAG, - ATOMAND64_DAG, - ATOMNAND64_DAG, - ATOMSWAP64_DAG, - ATOMCMPXCHG64_DAG, - ATOMMIN64_DAG, - ATOMUMIN64_DAG, - ATOMMAX64_DAG, - ATOMUMAX64_DAG + VST4LN_UPD }; } @@ -270,7 +258,7 @@ namespace llvm { } /// getSetCCResultType - Return the value type to use for ISD::SETCC. - virtual EVT getSetCCResultType(EVT VT) const; + virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, @@ -298,6 +286,9 @@ namespace llvm { using TargetLowering::isZExtFree; virtual bool isZExtFree(SDValue Val, EVT VT2) const; + virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. 
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const; @@ -349,7 +340,7 @@ namespace llvm { std::pair getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; + MVT VT) const; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is @@ -372,6 +363,12 @@ namespace llvm { /// be used for loads / stores from the global. virtual unsigned getMaximalGlobalOffset() const; + /// Returns true if a cast between SrcAS and DestAS is a noop. + virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { + // Addrspacecasts are always noops. + return true; + } + /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo, @@ -412,21 +409,21 @@ namespace llvm { void addQRTypeForNEON(MVT VT); typedef SmallVector, 8> RegsToPassVector; - void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, + void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, - SmallVector &MemOpChains, + SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const; SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, - DebugLoc dl) const; + SDLoc dl) const; CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; @@ -457,13 +454,26 @@ namespace llvm { const ARMSubtarget *ST) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + /// + /// ARM supports both fused and unfused multiply-add operations; we already + /// lower a pair of fmul and fadd to the latter so it's not clear that there + /// would be a gain or that the gain would be worthwhile enough to risk + /// correctness bugs. 
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const; @@ -471,24 +481,26 @@ namespace llvm { LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc dl, SDValue &Chain, + SDLoc dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, unsigned OffsetFromOrigArg, unsigned ArgOffset, + unsigned ArgSize, bool ForceMutable) const; void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc dl, SDValue &Chain, + SDLoc dl, SDValue &Chain, unsigned ArgOffset, bool ForceMutable = false) const; void computeRegArea(CCState &CCInfo, MachineFunction &MF, unsigned InRegsParamRecordIdx, + unsigned ArgSize, unsigned &ArgRegsSize, unsigned &ArgRegsSaveSize) const; @@ -522,16 +534,16 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - DebugLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const; virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const; virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const; + SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, - SelectionDAG &DAG, DebugLoc dl) const; + SelectionDAG &DAG, SDLoc dl) const; SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; @@ -556,6 +568,8 @@ namespace llvm { unsigned Size, bool signExtend, ARMCC::CondCodes Cond) const; + MachineBasicBlock *EmitAtomicLoad64(MachineInstr *MI, + MachineBasicBlock *BB) const; void SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 67a6820932fc..f93504fddb8e 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -155,6 +155,16 @@ def pred : PredicateOperand, PredicateOp, + ComplexPattern { + let MIOperandInfo = (ops i32imm, i32imm); + let PrintMethod = "printPredicateOperand"; +} + // Conditional code result for instructions whose 's' bit is set, e.g. subs. 
def CCOutOperand : AsmOperandClass { let Name = "CCOut"; } def cc_out : OptionalDefOperand { @@ -237,6 +247,8 @@ class t2InstAlias : InstAlias, Requires<[IsThumb2]>; class VFP2InstAlias : InstAlias, Requires<[HasVFP2]>; +class VFP2DPInstAlias + : InstAlias, Requires<[HasVFP2,HasDPVFP]>; class VFP3InstAlias : InstAlias, Requires<[HasVFP3]>; class NEONInstAlias @@ -490,8 +502,7 @@ class JTI; -// Atomic load/store instructions -class AIldrex opcod, dag oops, dag iops, InstrItinClass itin, +class AIldr_ex_or_acq opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : I { @@ -502,23 +513,52 @@ class AIldrex opcod, dag oops, dag iops, InstrItinClass itin, let Inst{20} = 1; let Inst{19-16} = addr; let Inst{15-12} = Rt; - let Inst{11-0} = 0b111110011111; + let Inst{11-10} = 0b11; + let Inst{9-8} = opcod2; + let Inst{7-0} = 0b10011111; } -class AIstrex opcod, dag oops, dag iops, InstrItinClass itin, +class AIstr_ex_or_rel opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> : I { - bits<4> Rd; bits<4> Rt; bits<4> addr; let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; let Inst{20} = 0; let Inst{19-16} = addr; - let Inst{15-12} = Rd; - let Inst{11-4} = 0b11111001; + let Inst{11-10} = 0b11; + let Inst{9-8} = opcod2; + let Inst{7-4} = 0b1001; let Inst{3-0} = Rt; } +// Atomic load/store instructions +class AIldrex opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AIldr_ex_or_acq; + +class AIstrex opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AIstr_ex_or_rel { + bits<4> Rd; + let Inst{15-12} = Rd; +} + +// Exclusive load/store instructions + +class AIldaex opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AIldr_ex_or_acq, + Requires<[IsARM, HasV8]>; + +class AIstlex opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AIstr_ex_or_rel, + Requires<[IsARM, HasV8]> { + bits<4> Rd; + let Inst{15-12} = Rd; +} + class AIswp pattern> : AI { bits<4> Rt; @@ -535,6 +575,18 @@ class AIswp pattern> let Unpredictable{11-8} = 0b1111; let DecoderMethod = "DecodeSwap"; } +// Acquire/Release load/store instructions +class AIldracq opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AIldr_ex_or_acq, + Requires<[IsARM, HasV8]>; + +class AIstrrel opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list pattern> + : AIstr_ex_or_rel, + Requires<[IsARM, HasV8]> { + let Inst{15-12} = 0b1111; +} // addrmode1 instructions class AI1 opcod, dag oops, dag iops, Format f, InstrItinClass itin, @@ -1230,8 +1282,9 @@ class T2JTI; // Move to/from coprocessor instructions -class T2Cop opc, dag oops, dag iops, string asm, list pattern> - : T2XI , Requires<[IsThumb2]> { +class T2Cop opc, dag oops, dag iops, string opcstr, string asm, + list pattern> + : T2I , Requires<[IsThumb2]> { let Inst{31-28} = opc; } @@ -1389,7 +1442,6 @@ class ADI5 opcod1, bits<2> opcod2, dag oops, dag iops, let Inst{15-12} = Dd{3-0}; let Inst{7-0} = addr{7-0}; // imm8 - // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; let Inst{11-9} = 0b101; @@ -1415,7 +1467,6 @@ class ASI5 opcod1, bits<2> opcod2, dag oops, dag iops, let Inst{15-12} = Sd{4-1}; let Inst{7-0} = addr{7-0}; // imm8 - // TODO: Mark the instructions with the appropriate subtarget info. 
let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; let Inst{11-9} = 0b101; @@ -1437,6 +1488,28 @@ class PseudoVFPLdStM pattern> + : VFPXI { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = 0; + let Inst{15-12} = regs{11-8}; + let Inst{7-1} = regs{7-1}; + + let Inst{27-25} = 0b110; + let Inst{11-8} = 0b1011; + let Inst{0} = 1; +} + +// Double precision class AXDI4 pattern> : VFPXI pattern> : VFPXI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, let Inst{8} = 1; // Double precision let Inst{7-6} = opcod4; let Inst{4} = opcod5; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// Double precision, unary, not-predicated +class ADuInp opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, + string asm, list pattern> + : VFPXI { + // Instruction operands. + bits<5> Dd; + bits<5> Dm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; } // Double precision, binary @@ -1525,9 +1626,42 @@ class ADbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{8} = 1; // Double precision let Inst{6} = op6; let Inst{4} = op4; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// FP, binary, not predicated +class ADbInp opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, + InstrItinClass itin, string asm, list pattern> + : VFPXI +{ + // Instruction operands. + bits<5> Dd; + bits<5> Dn; + bits<5> Dm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{19-16} = Dn{3-0}; + let Inst{7} = Dn{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // double precision + let Inst{6} = opcod3; + let Inst{4} = 0; + + let Predicates = [HasVFP2, HasDPVFP]; } -// Single precision, unary +// Single precision, unary, predicated class ASuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list pattern> @@ -1551,6 +1685,33 @@ class ASuI opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, let Inst{4} = opcod5; } +// Single precision, unary, non-predicated +class ASuInp opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, + string asm, list pattern> + : VFPXI { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + // Single precision unary, if no NEON. Same as ASuI except not available if // NEON is enabled. 
class ASuIn opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, @@ -1586,6 +1747,35 @@ class ASbI opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, let Inst{4} = op4; } +// Single precision, binary, not predicated +class ASbInp opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, + InstrItinClass itin, string asm, list pattern> + : VFPXI +{ + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{6} = opcod3; + let Inst{4} = 0; +} + // Single precision binary, if no NEON. Same as ASbI except not available if // NEON is enabled. class ASbIn opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, @@ -1698,6 +1888,21 @@ class NeonXI pattern> + : InstARM { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = !strconcat(opc, ".", dt, "\t", asm); + let Pattern = pattern; + list Predicates = [HasNEON]; + let DecoderNamespace = "NEON"; + + let Inst{31-28} = 0b1111; +} + class NLdSt op21_20, bits<4> op11_8, bits<4> op7_4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list pattern> @@ -1817,6 +2022,35 @@ class N2V op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, let Inst{5} = Vm{4}; } +// Same as N2V but not predicated. +class N2Vnp op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6, + dag oops, dag iops, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, list pattern> + : NeonInp { + bits<5> Vd; + bits<5> Vm; + + // Encode instruction operands + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; + + // Encode constant bits + let Inst{27-23} = 0b00111; + let Inst{21-20} = 0b11; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11} = 0; + let Inst{10-8} = op10_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = 0; + + let DecoderNamespace = "NEON"; +} + // Same as N2V except it doesn't have a datatype suffix. 
class N2VX op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, @@ -1898,6 +2132,32 @@ class N3V op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{5} = Vm{4}; } +class N3Vnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops,Format f, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable, list pattern> + : NeonInp { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + // Encode instruction operands + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; + + // Encode constant bits + let Inst{27-23} = op27_23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + class N3VLane32 op21_20, bits<4> op11_8, bit op6, bit op4, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 80f0ec74376a..df867b49ab57 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCAsmInfo.h" @@ -29,7 +30,7 @@ using namespace llvm; ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(*this, STI) { + : ARMBaseInstrInfo(STI), RI(STI) { } /// getNoopForMachoTarget - Return the noop instruction to use for a noop. @@ -106,29 +107,42 @@ namespace { if (TM->getRelocationModel() != Reloc::PIC_) return false; - LLVMContext* Context = &MF.getFunction()->getContext(); - GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false, - GlobalValue::ExternalLinkage, 0, - "_GLOBAL_OFFSET_TABLE_"); - unsigned Id = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id); - unsigned Align = TM->getDataLayout()->getPrefTypeAlignment(GV->getType()); + LLVMContext *Context = &MF.getFunction()->getContext(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + unsigned PCAdj = TM->getSubtarget().isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create( + *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj); + + unsigned Align = TM->getDataLayout() + ->getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align); MachineBasicBlock &FirstMBB = MF.front(); MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); + unsigned TempReg = + MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = TM->getSubtarget().isThumb2() ? ARM::t2LDRpci : ARM::LDRcp; const TargetInstrInfo &TII = *TM->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL, - TII.get(Opc), GlobalBaseReg) + TII.get(Opc), TempReg) .addConstantPoolIndex(Idx); if (Opc == ARM::LDRcp) MIB.addImm(0); AddDefaultPred(MIB); + // Fix the GOT address by adding pc. + unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); + Opc = TM->getSubtarget().isThumb2() ? 
ARM::tPICADD + : ARM::PICADD; + MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg) + .addReg(TempReg) + .addImm(ARMPCLabelIndex); + if (Opc == ARM::PICADD) + AddDefaultPred(MIB); + + return true; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 1bd174e34162..2042c0460932 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -71,6 +71,9 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; + def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, @@ -118,7 +121,8 @@ def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - +def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; @@ -162,8 +166,6 @@ def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain, SDNPSideEffect]>; -def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, - [SDNPHasChain, SDNPSideEffect]>; def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain, SDNPSideEffect]>; def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, @@ -174,9 +176,11 @@ def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; +def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; +def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. 
// @@ -189,11 +193,18 @@ def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, def HasV6 : Predicate<"Subtarget->hasV6Ops()">, AssemblerPredicate<"HasV6Ops", "armv6">; def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; +def HasV6M : Predicate<"Subtarget->hasV6MOps()">, + AssemblerPredicate<"HasV6MOps", + "armv6m or armv6t2">; def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, AssemblerPredicate<"HasV6T2Ops", "armv6t2">; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; def HasV7 : Predicate<"Subtarget->hasV7Ops()">, AssemblerPredicate<"HasV7Ops", "armv7">; +def HasV8 : Predicate<"Subtarget->hasV8Ops()">, + AssemblerPredicate<"HasV8Ops", "armv8">; +def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, + AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -201,14 +212,23 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate<"FeatureVFP3", "VFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, AssemblerPredicate<"FeatureVFP4", "VFP4">; +def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">, + AssemblerPredicate<"!FeatureVFPOnlySP", + "double precision VFP">; +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON", "NEON">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16","half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, - AssemblerPredicate<"FeatureHWDiv", "divide">; + AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, - AssemblerPredicate<"FeatureHWDivARM">; + AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">; def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, AssemblerPredicate<"FeatureT2XtPk", "pack/extract">; @@ -233,10 +253,10 @@ def IsThumb2 : Predicate<"Subtarget->isThumb2()">, AssemblerPredicate<"ModeThumb,FeatureThumb2", "thumb2">; def IsMClass : Predicate<"Subtarget->isMClass()">, - AssemblerPredicate<"FeatureMClass", "armv7m">; -def IsARClass : Predicate<"!Subtarget->isMClass()">, + AssemblerPredicate<"FeatureMClass", "armv*m">; +def IsNotMClass : Predicate<"!Subtarget->isMClass()">, AssemblerPredicate<"!FeatureMClass", - "armv7a/r">; + "!armv*m">; def IsARM : Predicate<"!Subtarget->isThumb()">, AssemblerPredicate<"!ModeThumb", "arm-mode">; def IsIOS : Predicate<"Subtarget->isTargetIOS()">; @@ -258,7 +278,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">; def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast) && " "!Subtarget->isTargetDarwin()">; -def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || " +def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast &&" + " Subtarget->hasVFP4()) || " "Subtarget->isTargetDarwin()">; // VGETLNi32 is microcoded on Swift - prefer VMOV. 
@@ -275,8 +297,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">; def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">; def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">; -def IsLE : Predicate<"TLI.isLittleEndian()">; -def IsBE : Predicate<"TLI.isBigEndian()">; +def IsLE : Predicate<"getTargetLowering()->isLittleEndian()">; +def IsBE : Predicate<"getTargetLowering()->isBigEndian()">; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -456,7 +478,7 @@ def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; } def adrlabel : Operand { let EncoderMethod = "getAdrLabelOpValue"; let ParserMatchClass = AdrLabelAsmOperand; - let PrintMethod = "printAdrLabelOperand"; + let PrintMethod = "printAdrLabelOperand<0>"; } def neon_vcvt_imm32 : Operand { @@ -581,17 +603,6 @@ def imm0_1 : Operand { let ParserMatchClass = Imm0_1AsmOperand; } def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; } def imm0_3 : Operand { let ParserMatchClass = Imm0_3AsmOperand; } -/// imm0_4 predicate - Immediate in the range [0,4]. -def Imm0_4AsmOperand : ImmAsmOperand -{ - let Name = "Imm0_4"; - let DiagnosticType = "ImmRange0_4"; -} -def imm0_4 : Operand, ImmLeaf= 0 && Imm < 5; }]> { - let ParserMatchClass = Imm0_4AsmOperand; - let DecoderMethod = "DecodeImm0_4"; -} - /// imm0_7 predicate - Immediate in the range [0,7]. def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; } def imm0_7 : Operand, ImmLeaf, ImmLeaf, ImmLeaf= 0 && Imm < 240; }]> { + let ParserMatchClass = Imm0_239AsmOperand; +} + /// imm0_255 predicate - Immediate in the range [0,255]. def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; } def imm0_255 : Operand, ImmLeaf= 0 && Imm < 256; }]> { @@ -702,6 +722,11 @@ def imm0_65535_expr : Operand { let ParserMatchClass = Imm0_65535ExprAsmOperand; } +def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; } +def imm256_65535_expr : Operand { + let ParserMatchClass = Imm256_65535ExprAsmOperand; +} + /// imm24b - True if the 32-bit immediate is encodable in 24 bits. def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; } def imm24b : Operand, ImmLeaf { let DecoderMethod = "DecodeCoprocessor"; } -def pf_imm : Operand { - let PrintMethod = "printPImmediate"; - let ParserMatchClass = CoprocNumAsmOperand; -} - def CoprocRegAsmOperand : AsmOperandClass { let Name = "CoprocReg"; let ParserMethod = "parseCoprocRegOperand"; @@ -1327,7 +1347,7 @@ class AI_ext_rrot opcod, string opc, PatFrag opnode> : AExtI, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> { bits<4> Rd; bits<4> Rm; bits<2> rot; @@ -1340,11 +1360,11 @@ class AI_ext_rrot opcod, string opc, PatFrag opnode> class AI_ext_rrot_np opcod, string opc> : AExtI, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> { bits<2> rot; let Inst{19-16} = 0b1111; let Inst{11-10} = rot; -} + } /// AI_exta_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. 
@@ -1353,7 +1373,7 @@ class AI_exta_rrot opcod, string opc, PatFrag opnode> IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", [(set GPRnopc:$Rd, (opnode GPR:$Rn, (rotr GPRnopc:$Rm, rot_imm:$rot)))]>, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> { bits<4> Rd; bits<4> Rm; bits<4> Rn; @@ -1368,7 +1388,7 @@ class AI_exta_rrot opcod, string opc, PatFrag opnode> class AI_exta_rrot_np opcod, string opc> : AExtI, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> { bits<4> Rn; bits<2> rot; let Inst{19-16} = Rn; @@ -1664,53 +1684,11 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, [(ARMcallseq_start timm:$amt)]>; } -// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops. -// (These pseudos use a hand-written selection code). -let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in { -def ATOMOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMXOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMADD6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMSUB6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2, - GPR:$set1, GPR:$set2), - NoItinerary, []>; -def ATOMMIN6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMUMIN6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMMAX6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -def ATOMUMAX6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), - (ins GPR:$addr, GPR:$src1, GPR:$src2), - NoItinerary, []>; -} - -def HINT : AI<(outs), (ins imm0_4:$imm), MiscFrm, NoItinerary, +def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> { - bits<3> imm; - let Inst{27-3} = 0b0011001000001111000000000; - let Inst{2-0} = imm; + bits<8> imm; + let Inst{27-8} = 0b00110010000011110000; + let Inst{7-0} = imm; } def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>; @@ -1718,6 +1696,9 @@ def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>; def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>; def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>; def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>; + +def : Pat<(int_arm_sevl), (HINT 5)>; def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { @@ -1735,12 +1716,23 @@ def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", // The 16-bit operand $val can be used by a debugger to store more information // about the breakpoint. 
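With HINT widened to the 8-bit imm0_239 operand above, the ARM-mode encoding is just the fixed 0x320F000 pattern plus the condition and the hint number; the aliases map nop=0, yield=1, wfe=2, wfi=3, sev=4 and (ARMv8 only) sevl=5. A small sketch of that encoding, assuming the AL condition by default; encodeHint is a made-up helper, not an LLVM API:

    #include <cstdint>
    #include <cstdio>

    // Assemble an ARM-mode HINT word:
    //   cond[31:28] | 0011 0010 0000 1111 0000 | imm8[7:0]
    uint32_t encodeHint(uint8_t imm8, uint8_t cond = 0xE /* AL */) {
      return (uint32_t(cond) << 28) | 0x0320F000u | imm8;
    }

    int main() {
      std::printf("nop  = %08X\n", unsigned(encodeHint(0))); // E320F000
      std::printf("sevl = %08X\n", unsigned(encodeHint(5))); // E320F005, ARMv8 only
    }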
-def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, - "bkpt", "\t$val", []>, Requires<[IsARM]> { +def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, + "bkpt", "\t$val", []>, Requires<[IsARM]> { bits<16> val; let Inst{3-0} = val{3-0}; let Inst{19-8} = val{15-4}; let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0xe; // AL + let Inst{7-4} = 0b0111; +} + +def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, + "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> { + bits<16> val; + let Inst{3-0} = val{3-0}; + let Inst{19-8} = val{15-4}; + let Inst{27-20} = 0b00010000; + let Inst{31-28} = 0xe; // AL let Inst{7-4} = 0b0111; } @@ -1780,7 +1772,8 @@ multiclass APreLoad read, bits<1> data, string opc> { def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload, !strconcat(opc, "\t$addr"), - [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> { + [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>, + Sched<[WritePreLd]> { bits<4> Rt; bits<17> addr; let Inst{31-26} = 0b111101; @@ -1796,7 +1789,8 @@ multiclass APreLoad read, bits<1> data, string opc> { def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload, !strconcat(opc, "\t$shift"), - [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> { + [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>, + Sched<[WritePreLd]> { bits<17> shift; let Inst{31-26} = 0b111101; let Inst{25} = 1; // 1 for register form @@ -1816,7 +1810,7 @@ defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>; defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>; def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary, - "setend\t$end", []>, Requires<[IsARM]> { + "setend\t$end", []>, Requires<[IsARM]>, Deprecated { bits<1> end; let Inst{31-10} = 0b1111000100000001000000; let Inst{9} = end; @@ -1863,7 +1857,8 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, let isNotDuplicable = 1 in { def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), 4, IIC_iALUr, - [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; + [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>, + Sched<[WriteALU, ReadALU]>; let AddedComplexity = 10 in { def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), @@ -1923,11 +1918,11 @@ def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), let hasSideEffects = 1 in { def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), - 4, IIC_iALUi, []>; + 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, nohash_imm:$id, pred:$p), - 4, IIC_iALUi, []>; + 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; } //===----------------------------------------------------------------------===// @@ -1938,16 +1933,22 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { // ARMV4T and above def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, "bx", "\tlr", [(ARMretflag)]>, - Requires<[IsARM, HasV4T]> { + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { let Inst{27-0} = 0b0001001011111111111100011110; } // ARMV4 only def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, "mov", "\tpc, lr", [(ARMretflag)]>, - Requires<[IsARM, NoV4T]> { + Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> { let Inst{27-0} = 0b0001101000001111000000001110; } + + // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets + // the user-space one). 
+ def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p), + 4, IIC_Br, + [(ARMintretflag imm:$offset)]>; } // Indirect branches @@ -1955,7 +1956,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // ARMV4T and above def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", [(brind GPR:$dst)]>, - Requires<[IsARM, HasV4T]> { + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { bits<4> dst; let Inst{31-4} = 0b1110000100101111111111110001; let Inst{3-0} = dst; @@ -1963,7 +1964,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx", "\t$dst", [/* pattern left blank */]>, - Requires<[IsARM, HasV4T]> { + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { bits<4> dst; let Inst{27-4} = 0b000100101111111111110001; let Inst{3-0} = dst; @@ -1980,7 +1981,7 @@ let isCall = 1, def BL : ABXI<0b1011, (outs), (ins bl_target:$func), IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, - Requires<[IsARM]> { + Requires<[IsARM]>, Sched<[WriteBrL]> { let Inst{31-28} = 0b1110; bits<24> func; let Inst{23-0} = func; @@ -1990,7 +1991,7 @@ let isCall = 1, def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func), IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, - Requires<[IsARM]> { + Requires<[IsARM]>, Sched<[WriteBrL]> { bits<24> func; let Inst{23-0} = func; let DecoderMethod = "DecodeBranchImmInstruction"; @@ -2000,7 +2001,7 @@ let isCall = 1, def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, - Requires<[IsARM, HasV5T]> { + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { bits<4> func; let Inst{31-4} = 0b1110000100101111111111110011; let Inst{3-0} = func; @@ -2009,7 +2010,7 @@ let isCall = 1, def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx", "\t$func", [(ARMcall_pred GPR:$func)]>, - Requires<[IsARM, HasV5T]> { + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { bits<4> func; let Inst{27-4} = 0b000100101111111111110011; let Inst{3-0} = func; @@ -2019,18 +2020,18 @@ let isCall = 1, // Note: Restrict $func to the tGPR regclass to prevent it being in LR. def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, HasV4T]>; + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>; // ARMv4 def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, NoV4T]>; + Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>; // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, - Requires<[IsARM]>; + Requires<[IsARM]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1 in { @@ -2038,7 +2039,8 @@ let isBranch = 1, isTerminator = 1 in { // a two-value operand where a dag node expects two operands. :( def Bcc : ABI<0b1010, (outs), (ins br_target:$target), IIC_Br, "b", "\t$target", - [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> { + [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>, + Sched<[WriteBr]> { bits<24> target; let Inst{23-0} = target; let DecoderMethod = "DecodeBranchImmInstruction"; @@ -2051,25 +2053,27 @@ let isBranch = 1, isTerminator = 1 in { // should be sufficient. // FIXME: Is B really a Barrier? That doesn't seem right. 
def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br, - [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>; + [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>, + Sched<[WriteBr]>; let isNotDuplicable = 1, isIndirectBranch = 1 in { def BR_JTr : ARMPseudoInst<(outs), (ins GPR:$target, i32imm:$jt, i32imm:$id), 0, IIC_Br, - [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>; + [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>, + Sched<[WriteBr]>; // FIXME: This shouldn't use the generic "addrmode2," but rather be split // into i12 and rs suffixed versions. def BR_JTm : ARMPseudoInst<(outs), (ins addrmode2:$target, i32imm:$jt, i32imm:$id), 0, IIC_Br, [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, - imm:$id)]>; + imm:$id)]>, Sched<[WriteBrTbl]>; def BR_JTadd : ARMPseudoInst<(outs), (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id), 0, IIC_Br, [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, - imm:$id)]>; + imm:$id)]>, Sched<[WriteBrTbl]>; } // isNotDuplicable = 1, isIndirectBranch = 1 } // isBarrier = 1 @@ -2078,7 +2082,7 @@ let isBranch = 1, isTerminator = 1 in { // BLX (immediate) def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary, "blx\t$target", []>, - Requires<[IsARM, HasV5T]> { + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { let Inst{31-25} = 0b1111101; bits<25> target; let Inst{23-0} = target{24-1}; @@ -2087,7 +2091,7 @@ def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary, // Branch and Exchange Jazelle def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", - [/* pattern left blank */]> { + [/* pattern left blank */]>, Sched<[WriteBr]> { bits<4> func; let Inst{23-20} = 0b0010; let Inst{19-8} = 0xfff; @@ -2098,18 +2102,20 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", // Tail calls. 
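The "Tail calls" block that follows (TCRETURNdi/TCRETURNri and the TAILJMP expansions) exists for calls in tail position. Under optimization a call like the sketch below is the usual candidate, ending up as a direct branch instead of bl plus a return; the exact decision is still subject to the normal tail-call eligibility checks (names here are illustrative):

    int callee(int);

    // Tail position: eligible to be emitted as a branch to callee rather than
    // "bl callee" followed by a return.
    int caller(int x) { return callee(x + 1); }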
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>; + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>, + Sched<[WriteBr]>; - def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>; + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>, + Sched<[WriteBr]>; def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst), 4, IIC_Br, [], (Bcc br_target:$dst, (ops 14, zero_reg))>, - Requires<[IsARM]>; + Requires<[IsARM]>, Sched<[WriteBr]>; def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst), 4, IIC_Br, [], - (BX GPR:$dst)>, + (BX GPR:$dst)>, Sched<[WriteBr]>, Requires<[IsARM]>; } @@ -2123,7 +2129,8 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", // Supervisor Call (Software Interrupt) let isCall = 1, Uses = [SP] in { -def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> { +def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>, + Sched<[WriteBr]> { bits<24> svc; let Inst{23-0} = svc; } @@ -2272,6 +2279,13 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), []>, Requires<[IsARM, HasV5TE]>; } +def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "lda", "\t$Rt, $addr", []>; +def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldab", "\t$Rt, $addr", []>; +def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldah", "\t$Rt, $addr", []>; + // Indexed loads multiclass AI2_ldridx { @@ -2284,7 +2298,6 @@ multiclass AI2_ldridx op, string opc, InstrItinClass itin> { let Inst{19-16} = addr{12-9}; // Rn let Inst{11-8} = addr{7-4}; // imm7_4/zero let Inst{3-0} = addr{3-0}; // imm3_0/Rm - let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3"; let DecoderMethod = "DecodeAddrMode3Instruction"; } def _POST : AI3ldstidx op, string opc> { let Inst{22} = 1; let Inst{11-8} = offset{7-4}; let Inst{3-0} = offset{3-0}; - let AsmMatchConverter = "cvtLdExtTWriteBackImm"; } def r : AI3ldstidxT op, string opc> { let Inst{11-8} = 0; let Unpredictable{11-8} = 0b1111; let Inst{3-0} = Rm{3-0}; - let AsmMatchConverter = "cvtLdExtTWriteBackReg"; let DecoderMethod = "DecodeLDR"; } } @@ -2544,7 +2552,6 @@ multiclass AI2_stridx op, string opc> { let Inst{22} = 1; let Inst{11-8} = offset{7-4}; let Inst{3-0} = offset{3-0}; - let AsmMatchConverter = "cvtStExtTWriteBackImm"; } def r : AI3ldstidxT op, string opc> { let Inst{22} = 0; let Inst{11-8} = 0; let Inst{3-0} = Rm{3-0}; - let AsmMatchConverter = "cvtStExtTWriteBackReg"; } } defm STRHT : AI3strT<0b1011, "strht">; +def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stl", "\t$Rt, $addr", []>; +def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlb", "\t$Rt, $addr", []>; +def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlh", "\t$Rt, $addr", []>; //===----------------------------------------------------------------------===// // Load / store multiple Instructions. 
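The LDA*/STL* definitions above are the ARMv8 load-acquire and store-release forms. They are the natural lowering target for acquire loads and release stores on std::atomic, which previously needed a separate barrier; a minimal usage sketch (variable and function names are illustrative):

    #include <atomic>

    std::atomic<int> flag{0};
    int payload;

    // Release store: on an ARMv8 target this can be a single STL
    // instead of a barrier followed by a plain store.
    void publish(int v) {
      payload = v;
      flag.store(1, std::memory_order_release);
    }

    // Acquire load: likewise a single LDA instead of a load plus barrier.
    bool consume(int &out) {
      if (flag.load(std::memory_order_acquire) == 0)
        return false;
      out = payload;
      return true;
    }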
@@ -2955,7 +2963,7 @@ defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m, let neverHasSideEffects = 1 in def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, - "mov", "\t$Rd, $Rm", []>, UnaryDP { + "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<4> Rm; @@ -2969,7 +2977,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, // A version for the smaller set of tail call registers. let neverHasSideEffects = 1 in def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, - IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP { + IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<4> Rm; @@ -2982,7 +2990,8 @@ def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src), DPSoRegRegFrm, IIC_iMOVsr, "mov", "\t$Rd, $src", - [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP { + [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP, + Sched<[WriteALU]> { bits<4> Rd; bits<12> src; let Inst{15-12} = Rd; @@ -2998,7 +3007,7 @@ def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src), def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src), DPSoRegImmFrm, IIC_iMOVsr, "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>, - UnaryDP { + UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> src; let Inst{15-12} = Rd; @@ -3011,7 +3020,8 @@ def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src), let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi, - "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP { + "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP, + Sched<[WriteALU]> { bits<4> Rd; bits<12> imm; let Inst{25} = 1; @@ -3025,7 +3035,7 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm), DPFrm, IIC_iMOVi, "movw", "\t$Rd, $imm", [(set GPR:$Rd, imm0_65535:$imm)]>, - Requires<[IsARM, HasV6T2]>, UnaryDP { + Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<16> imm; let Inst{15-12} = Rd; @@ -3041,7 +3051,8 @@ def : InstAlias<"mov${p} $Rd, $imm", Requires<[IsARM]>; def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), - (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, + Sched<[WriteALU]>; let Constraints = "$src = $Rd" in { def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), @@ -3051,7 +3062,7 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), [(set GPRnopc:$Rd, (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>, UnaryDP, - Requires<[IsARM, HasV6T2]> { + Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> { bits<4> Rd; bits<16> imm; let Inst{15-12} = Rd; @@ -3063,7 +3074,8 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), } def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), - (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, + Sched<[WriteALU]>; } // Constraints @@ -3073,7 +3085,7 @@ def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, let Uses = [CPSR] in def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi, [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP, - Requires<[IsARM]>; + Requires<[IsARM]>, Sched<[WriteALU]>; // These aren't really mov instructions, but we have to define them this way // due to flag operands. 
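MOVi16 and MOVTi16 above carry the movw/movt patterns: movw writes the low 16 bits and clears the top half, movt then overwrites the top 16 bits, so an arbitrary 32-bit constant is built in two instructions. A small arithmetic sketch of that split; materialize is purely illustrative and mirrors the (or (and src, 0xffff), imm) shape of the MOVTi16 pattern:

    #include <cassert>
    #include <cstdint>

    uint32_t materialize(uint32_t value) {
      uint16_t lo = value & 0xFFFFu;  // movw Rd, #lo -> Rd = lo, top half zero
      uint16_t hi = value >> 16;      // movt Rd, #hi -> Rd[31:16] = hi
      uint32_t rd = lo;                                // state after movw
      rd = (rd & 0xFFFFu) | (uint32_t(hi) << 16);      // state after movt
      return rd;
    }

    int main() { assert(materialize(0xDEADBEEFu) == 0xDEADBEEFu); }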
@@ -3081,10 +3093,10 @@ def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi, let Defs = [CPSR] in { def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP, - Requires<[IsARM]>; + Sched<[WriteALU]>, Requires<[IsARM]>; def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP, - Requires<[IsARM]>; + Sched<[WriteALU]>, Requires<[IsARM]>; } //===----------------------------------------------------------------------===// @@ -3250,7 +3262,8 @@ class AAI op27_20, bits<8> op11_4, string opc, list pattern = [], dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm), string asm = "\t$Rd, $Rn, $Rm"> - : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> { + : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>, + Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rn; bits<4> Rd; bits<4> Rm; @@ -3265,9 +3278,11 @@ class AAI op27_20, bits<8> op11_4, string opc, // Saturating add/subtract +let DecoderMethod = "DecodeQADDInstruction" in def QADD : AAI<0b00010000, 0b00000101, "qadd", [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))], (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">; + def QSUB : AAI<0b00010010, 0b00000101, "qsub", [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))], (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">; @@ -3326,7 +3341,7 @@ def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">; def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), MulFrm /* for convenience */, NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> { bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -3340,7 +3355,7 @@ def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), MulFrm /* for convenience */, NoItinerary, "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{ bits<4> Rd; bits<4> Rn; bits<4> Rm; @@ -3473,7 +3488,7 @@ def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm), def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr, "mvn", "\t$Rd, $Rm", - [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP { + [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<4> Rm; let Inst{25} = 0; @@ -3484,7 +3499,8 @@ def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr, } def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift), DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", - [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP { + [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP, + Sched<[WriteALU]> { bits<4> Rd; bits<12> shift; let Inst{25} = 0; @@ -3496,7 +3512,8 @@ def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift), } def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift), DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", - [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP { + [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP, + Sched<[WriteALU]> { bits<4> Rd; bits<12> shift; let Inst{25} = 0; @@ -3511,7 +3528,7 @@ def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift), let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMVNi, "mvn", "\t$Rd, $imm", - [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP { + [(set GPR:$Rd, 
so_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> { bits<4> Rd; bits<12> imm; let Inst{25} = 1; @@ -3993,13 +4010,57 @@ def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd), // Alternate cases for PKHTB where identities eliminate some nodes. Note that // a shift amount of 0 is *not legal* here, it is PKHBT instead. +// We also can not replace a srl (17..31) by an arithmetic shift we would use in +// pkhtb src1, src2, asr (17..31). def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), - (srl GPRnopc:$src2, imm16_31:$sh)), + (srl GPRnopc:$src2, imm16:$sh)), + (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>; +def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), + (sra GPRnopc:$src2, imm16_31:$sh)), (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>; def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)), (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>; +//===----------------------------------------------------------------------===// +// CRC Instructions +// +// Polynomials: +// + CRC32{B,H,W} 0x04C11DB7 +// + CRC32C{B,H,W} 0x1EDC6F41 +// + +class AI_crc32 sz, string suffix, SDPatternOperator builtin> + : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary, + !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm", + [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>, + Requires<[IsARM, HasV8, HasCRC]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-28} = 0b1110; + let Inst{27-23} = 0b00010; + let Inst{22-21} = sz; + let Inst{20} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-10} = 0b00; + let Inst{9} = C; + let Inst{8} = 0; + let Inst{7-4} = 0b0100; + let Inst{3-0} = Rm; + + let Unpredictable{11-8} = 0b1101; +} + +def CRC32B : AI_crc32<0, 0b00, "b", int_arm_crc32b>; +def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>; +def CRC32H : AI_crc32<0, 0b01, "h", int_arm_crc32h>; +def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>; +def CRC32W : AI_crc32<0, 0b10, "w", int_arm_crc32w>; +def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>; + //===----------------------------------------------------------------------===// // Comparison Instructions... 
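For reference, the CRC32{B,H,W} polynomial 0x04C11DB7 quoted above is the standard CRC-32 generator and 0x1EDC6F41 is the CRC32C (Castagnoli) one. A bit-at-a-time sketch of the underlying math, MSB-first with no initial or final XOR, purely to make the polynomials concrete; the precise bit ordering and operand handling of the CRC32* instructions are as defined by the ARM ARM, not by this helper:

    #include <cstdint>

    // Fold one byte into a running CRC, MSB-first, using the given generator
    // (0x04C11DB7 for the CRC32 family, 0x1EDC6F41 for CRC32C).
    uint32_t crc32_update(uint32_t crc, uint8_t byte,
                          uint32_t poly = 0x04C11DB7u) {
      crc ^= uint32_t(byte) << 24;
      for (int bit = 0; bit < 8; ++bit)
        crc = (crc & 0x80000000u) ? (crc << 1) ^ poly : (crc << 1);
      return crc;
    }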
// @@ -4022,7 +4083,8 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs), let isCompare = 1, Defs = [CPSR] in { def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi, "cmn", "\t$Rn, $imm", - [(ARMcmn GPR:$Rn, so_imm:$imm)]> { + [(ARMcmn GPR:$Rn, so_imm:$imm)]>, + Sched<[WriteCMP, ReadALU]> { bits<4> Rn; bits<12> imm; let Inst{25} = 1; @@ -4038,7 +4100,7 @@ def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi, def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr, "cmn", "\t$Rn, $Rm", [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> - GPR:$Rn, GPR:$Rm)]> { + GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> { bits<4> Rn; bits<4> Rm; let isCommutable = 1; @@ -4056,7 +4118,8 @@ def CMNzrsi : AI1<0b1011, (outs), (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr, "cmn", "\t$Rn, $shift", [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> - GPR:$Rn, so_reg_imm:$shift)]> { + GPR:$Rn, so_reg_imm:$shift)]>, + Sched<[WriteCMPsi, ReadALU]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; @@ -4074,7 +4137,8 @@ def CMNzrsr : AI1<0b1011, (outs), (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr, "cmn", "\t$Rn, $shift", [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> - GPRnopc:$Rn, so_reg_reg:$shift)]> { + GPRnopc:$Rn, so_reg_reg:$shift)]>, + Sched<[WriteCMPsr, ReadALU]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; @@ -4112,65 +4176,77 @@ let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, def BCCi64 : PseudoInst<(outs), (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), IIC_Br, - [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>; + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>, + Sched<[WriteBr]>; def BCCZi64 : PseudoInst<(outs), (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br, - [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>; + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>, + Sched<[WriteBr]>; } // usesCustomInserter // Conditional moves -// FIXME: should be able to write a pattern for ARMcmov, but can't use -// a two-value operand where a dag node expects two operands. 
:( let neverHasSideEffects = 1 in { let isCommutable = 1, isSelect = 1 in -def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p), +def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, GPR:$Rm, cmovpred:$p), 4, IIC_iCMOVr, - [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">; + [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$false, so_reg_imm:$shift, pred:$p), - 4, IIC_iCMOVsr, - [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift, - imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">; + (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p), + 4, IIC_iCMOVsr, + [(set GPR:$Rd, + (ARMcmov GPR:$false, so_reg_imm:$shift, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$false, so_reg_reg:$shift, pred:$p), + (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p), 4, IIC_iCMOVsr, - [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift, - imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">; + [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; let isMoveImm = 1 in -def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$false, imm0_65535_expr:$imm, pred:$p), - 4, IIC_iMOVi, - []>, - RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>; +def MOVCCi16 + : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p), + 4, IIC_iMOVi, + [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>, + Sched<[WriteALU]>; let isMoveImm = 1 in def MOVCCi : ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$false, so_imm:$imm, pred:$p), + (ins GPR:$false, so_imm:$imm, cmovpred:$p), 4, IIC_iCMOVi, - [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">; + [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; // Two instruction predicate mov immediate. 
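With the cmovpred operand, the MOVCC* pseudos above now select directly from ARMcmov, that is, from ordinary select nodes. A tiny example of the typical customer: with optimization a select like this is expected to become a compare plus predicated moves rather than a branch (the function name is illustrative):

    // Typically lowered to cmp followed by conditionally executed moves
    // (the MOVCC pseudos) rather than a conditional branch.
    int smaller(int a, int b) { return a < b ? a : b; }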
let isMoveImm = 1 in -def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$false, i32imm:$src, pred:$p), - 8, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; +def MOVCCi32imm + : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm:$src, cmovpred:$p), + 8, IIC_iCMOVix2, + [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>; let isMoveImm = 1 in def MVNCCi : ARMPseudoInst<(outs GPR:$Rd), - (ins GPR:$false, so_imm:$imm, pred:$p), + (ins GPR:$false, so_imm:$imm, cmovpred:$p), 4, IIC_iCMOVi, - [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">; + [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; } // neverHasSideEffects @@ -4189,10 +4265,20 @@ def memb_opt : Operand { let DecoderMethod = "DecodeMemBarrierOption"; } +def InstSyncBarrierOptOperand : AsmOperandClass { + let Name = "InstSyncBarrierOpt"; + let ParserMethod = "parseInstSyncBarrierOptOperand"; +} +def instsyncb_opt : Operand { + let PrintMethod = "printInstSyncBOption"; + let ParserMatchClass = InstSyncBarrierOptOperand; + let DecoderMethod = "DecodeInstSyncBarrierOption"; +} + // memory barriers protect the atomic sequences let hasSideEffects = 1 in { def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, - "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>, + "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>, Requires<[IsARM, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf57ff05; @@ -4201,7 +4287,7 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, } def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, - "dsb", "\t$opt", []>, + "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>, Requires<[IsARM, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf57ff04; @@ -4209,7 +4295,7 @@ def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, } // ISB has only full system option -def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, +def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary, "isb", "\t$opt", []>, Requires<[IsARM, HasDB]> { bits<4> opt; @@ -4217,124 +4303,219 @@ def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, let Inst{3-0} = opt; } +let usesCustomInserter = 1, Defs = [CPSR] in { + // Pseudo instruction that combines movs + predicated rsbmi // to implement integer ABS -let usesCustomInserter = 1, Defs = [CPSR] in -def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; + def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; -let usesCustomInserter = 1 in { - let Defs = [CPSR] in { +// Atomic pseudo-insts which will be lowered to ldrex/strex loops. +// (64-bit pseudos use a hand-written selection code). 
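These atomic pseudos (defined next) now take an explicit i32imm:$ordering operand and, as the comment says, are expanded to ldrex/strex retry loops after selection. At the source level that loop is what an ordinary std::atomic read-modify-write compiles to on ARM; a minimal usage sketch:

    #include <atomic>

    std::atomic<int> counter{0};

    // A sequentially consistent fetch_add: on ARM this becomes a
    // ldrex / add / strex / retry-on-failure loop plus whatever barriers the
    // requested ordering demands; that is the shape the ATOMIC_LOAD_ADD_I32
    // pseudo below is expanded into.
    int bump() { return counter.fetch_add(1, std::memory_order_seq_cst); }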
+ let mayLoad = 1, mayStore = 1 in { def ATOMIC_LOAD_ADD_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_SUB_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_AND_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_OR_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_XOR_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_NAND_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_MIN_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_MAX_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_UMIN_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_umin_8 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_UMAX_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_umax_8 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_SWAP_I8 : PseudoInst< + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$new, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_CMP_SWAP_I8 : PseudoInst< + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_ADD_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_SUB_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_AND_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_OR_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), 
NoItinerary, - [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_XOR_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_NAND_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_MIN_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_MAX_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_UMIN_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_umin_16 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_UMAX_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_umax_16 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_SWAP_I16 : PseudoInst< + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$new, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_CMP_SWAP_I16 : PseudoInst< + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_ADD_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_SUB_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_AND_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_OR_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_XOR_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_NAND_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, - [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$incr, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_MIN_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins 
GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_MAX_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_UMIN_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_LOAD_UMAX_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, - [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>; - - def ATOMIC_SWAP_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, - [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>; - def ATOMIC_SWAP_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, - [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$val, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_SWAP_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, - [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>; - - def ATOMIC_CMP_SWAP_I8 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, - [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>; - def ATOMIC_CMP_SWAP_I16 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, - [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>; + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$new, i32imm:$ordering), + NoItinerary, []>; def ATOMIC_CMP_SWAP_I32 : PseudoInst< - (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, - [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>; -} + (outs GPR:$dst), + (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_ADD_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_SUB_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_AND_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_OR_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_XOR_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_NAND_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_MIN_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_MAX_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_UMIN_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_LOAD_UMAX_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_SWAP_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, 
GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; + def ATOMIC_CMP_SWAP_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2, + GPR:$set1, GPR:$set2, i32imm:$ordering), + NoItinerary, []>; + } + let mayLoad = 1 in + def ATOMIC_LOAD_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, i32imm:$ordering), + NoItinerary, []>; + let mayStore = 1 in + def ATOMIC_STORE_I64 : PseudoInst< + (outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering), + NoItinerary, []>; } let usesCustomInserter = 1 in { @@ -4344,48 +4525,147 @@ let usesCustomInserter = 1 in { [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } +def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def strex_1 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_strex node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def strex_2 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_strex node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def strex_4 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_strex node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + let mayLoad = 1 in { def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), - NoItinerary, - "ldrexb", "\t$Rt, $addr", []>; + NoItinerary, "ldrexb", "\t$Rt, $addr", + [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>; def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), - NoItinerary, "ldrexh", "\t$Rt, $addr", []>; + NoItinerary, "ldrexh", "\t$Rt, $addr", + [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>; def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), - NoItinerary, "ldrex", "\t$Rt, $addr", []>; + NoItinerary, "ldrex", "\t$Rt, $addr", + [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>; let hasExtraDefRegAllocReq = 1 in -def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), +def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), NoItinerary, "ldrexd", "\t$Rt, $addr", []> { let DecoderMethod = "DecodeDoubleRegLoad"; } + +def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldaexb", "\t$Rt, $addr", []>; +def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldaexh", "\t$Rt, $addr", []>; +def LDAEX : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldaex", "\t$Rt, $addr", []>; +let hasExtraDefRegAllocReq = 1 in +def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), + NoItinerary, "ldaexd", "\t$Rt, $addr", []> { + let DecoderMethod = "DecodeDoubleRegLoad"; +} } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), - NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>; + NoItinerary, "strexb", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, (strex_1 GPR:$Rt, addr_offset_none:$addr))]>; def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), - NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>; + NoItinerary, "strexh", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, (strex_2 GPR:$Rt, 
addr_offset_none:$addr))]>; def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), - NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>; + NoItinerary, "strex", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, (strex_4 GPR:$Rt, addr_offset_none:$addr))]>; let hasExtraSrcRegAllocReq = 1 in def STREXD : AIstrex<0b01, (outs GPR:$Rd), (ins GPRPairOp:$Rt, addr_offset_none:$addr), NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> { let DecoderMethod = "DecodeDoubleRegStore"; } +def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlexb", "\t$Rd, $Rt, $addr", + []>; +def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlexh", "\t$Rd, $Rt, $addr", + []>; +def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlex", "\t$Rd, $Rt, $addr", + []>; +let hasExtraSrcRegAllocReq = 1 in +def STLEXD : AIstlex<0b01, (outs GPR:$Rd), + (ins GPRPairOp:$Rt, addr_offset_none:$addr), + NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> { + let DecoderMethod = "DecodeDoubleRegStore"; +} } - -def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>, +def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", + [(int_arm_clrex)]>, Requires<[IsARM, HasV7]> { let Inst{31-0} = 0b11110101011111111111000000011111; } +def : ARMPat<(and (ldrex_1 addr_offset_none:$addr), 0xff), + (LDREXB addr_offset_none:$addr)>; +def : ARMPat<(and (ldrex_2 addr_offset_none:$addr), 0xffff), + (LDREXH addr_offset_none:$addr)>; +def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), + (STREXB GPR:$Rt, addr_offset_none:$addr)>; +def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), + (STREXH GPR:$Rt, addr_offset_none:$addr)>; + +class acquiring_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Acquire || Ordering == SequentiallyConsistent; +}]>; + +def atomic_load_acquire_8 : acquiring_load; +def atomic_load_acquire_16 : acquiring_load; +def atomic_load_acquire_32 : acquiring_load; + +class releasing_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Release || Ordering == SequentiallyConsistent; +}]>; + +def atomic_store_release_8 : releasing_store; +def atomic_store_release_16 : releasing_store; +def atomic_store_release_32 : releasing_store; + +let AddedComplexity = 8 in { + def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr), (LDAB addr_offset_none:$addr)>; + def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>; + def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA addr_offset_none:$addr)>; + def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (STLB GPR:$val, addr_offset_none:$addr)>; + def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>; + def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>; +} + // SWP/SWPB are deprecated in V6/V7. 
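The rewritten ATOMIC_* pseudo-instructions above no longer carry selection patterns; each now takes an explicit i32imm:$ordering operand so that the later expansion in the backend can pick the exclusive/barrier sequence matching the requested memory ordering. For an 8-bit atomic add that expansion is essentially the usual load/store-exclusive retry loop sketched below (register assignments and the label are purely illustrative, and any DMB barriers implied by $ordering are omitted):

    @ ptr in r0, incr in r1, result in r2 -- illustrative only
    1:  ldrexb  r2, [r0]
        add     r2, r2, r1
        strexb  r3, r2, [r0]      @ r3 = 0 on success
        cmp     r3, #0
        bne     1b

The new ldrex_1/ldrex_2/ldrex_4 and strex_1/strex_2/strex_4 fragments, keyed on the access width recorded in the memory operand, additionally let the same LDREX*/STREX* instructions be selected directly from the @llvm.arm.ldrex and @llvm.arm.strex intrinsics.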
let mayLoad = 1, mayStore = 1 in { def SWP : AIswp<0, (outs GPRnopc:$Rt), - (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>; + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>, + Requires<[PreV8]>; def SWPB: AIswp<1, (outs GPRnopc:$Rt), - (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>; + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>, + Requires<[PreV8]>; } //===----------------------------------------------------------------------===// @@ -4396,7 +4676,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + imm:$CRm, imm:$opc2)]>, + Requires<[PreV8]> { bits<4> opc1; bits<4> CRn; bits<4> CRd; @@ -4413,11 +4694,12 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, let Inst{23-20} = opc1; } -def CDP2 : ABXI<0b1110, (outs), (ins pf_imm:$cop, imm0_15:$opc1, +def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + imm:$CRm, imm:$opc2)]>, + Requires<[PreV8]> { let Inst{31-28} = 0b1111; bits<4> opc1; bits<4> CRn; @@ -4595,10 +4877,10 @@ defm LDC : LdStCop <1, 0, "ldc">; defm LDCL : LdStCop <1, 1, "ldcl">; defm STC : LdStCop <0, 0, "stc">; defm STCL : LdStCop <0, 1, "stcl">; -defm LDC2 : LdSt2Cop<1, 0, "ldc2">; -defm LDC2L : LdSt2Cop<1, 1, "ldc2l">; -defm STC2 : LdSt2Cop<0, 0, "stc2">; -defm STC2L : LdSt2Cop<0, 1, "stc2l">; +defm LDC2 : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>; +defm STC2 : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>; +defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>; //===----------------------------------------------------------------------===// // Move between coprocessor and ARM core register. 
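The Requires<[PreV8]> predicate added to SWP/SWPB, CDP/CDP2 and the LDC2/STC2/LDC2L/STC2L family makes the architecture gating explicit: these forms keep assembling for pre-ARMv8 targets but are treated as unavailable when targeting ARMv8. For example (operands arbitrary):

    swp   r0, r1, [r2]      @ accepted for ARMv7 and earlier, rejected for ARMv8
    ldc2  p6, c0, [r1]      @ likewise now gated on PreV8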
@@ -4631,16 +4913,17 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>; + imm:$CRm, imm:$opc2)]>, + ComplexDeprecationPredicate<"MCR">; def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, - (outs GPR:$Rt), + (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), []>; def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", - (MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), @@ -4650,7 +4933,7 @@ class MovRCopro2 pattern> : ABXI<0b1110, oops, iops, NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> { - let Inst{31-28} = 0b1111; + let Inst{31-24} = 0b11111110; let Inst{20} = direction; let Inst{4} = 1; @@ -4674,16 +4957,18 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>; + imm:$CRm, imm:$opc2)]>, + Requires<[PreV8]>; def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm", (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, 0)>; def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, - (outs GPR:$Rt), + (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, - imm0_7:$opc2), []>; + imm0_7:$opc2), []>, + Requires<[PreV8]>; def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm", - (MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0)>; def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, @@ -4718,7 +5003,8 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; class MovRRCopro2 pattern = []> : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary, - !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>, + Requires<[PreV8]> { let Inst{31-28} = 0b1111; let Inst{23-21} = 0b010; let Inst{20} = direction; @@ -4820,7 +5106,7 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary, let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in { def TPsoft : PseudoInst<(outs), (ins), IIC_Br, - [(set R0, ARMthread_pointer)]>; + [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>; } //===----------------------------------------------------------------------===// @@ -4884,7 +5170,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst), 4, IIC_Br, [(brind GPR:$dst)], (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>, - Requires<[IsARM, NoV4T]>; + Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>; // Large immediate handling. 
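Switching the MRC/MRC2 destination operand class from GPR to GPRwithAPSR is what allows the flag-setting form of a coprocessor read, where the transferred value updates the N/Z/C/V flags instead of a core register. Both of the following should now be representable (the coprocessor and register numbers are arbitrary examples, not something this patch defines):

    mrc  p15, 0, r0, c0, c0, 0          @ ordinary destination register
    mrc  p15, 0, apsr_nzcv, c7, c10, 3  @ destination is the APSR flags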
@@ -5153,10 +5439,10 @@ def : MnemonicAlias<"rfeed", "rfeib">; def : MnemonicAlias<"rfe", "rfeia">; // SRS aliases -def : MnemonicAlias<"srsfa", "srsda">; -def : MnemonicAlias<"srsea", "srsdb">; -def : MnemonicAlias<"srsfd", "srsia">; -def : MnemonicAlias<"srsed", "srsib">; +def : MnemonicAlias<"srsfa", "srsib">; +def : MnemonicAlias<"srsea", "srsia">; +def : MnemonicAlias<"srsfd", "srsdb">; +def : MnemonicAlias<"srsed", "srsda">; def : MnemonicAlias<"srs", "srsia">; // QSAX == QSUBADDX @@ -5233,7 +5519,7 @@ def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm", cc_out:$s)>; } def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm", - (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>; + (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>; let TwoOperandAliasConstraint = "$Rn = $Rd" in { def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, @@ -5269,4 +5555,5 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", // 'it' blocks in ARM mode just validate the predicates. The IT itself // is discarded. -def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>; +def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, + ComplexDeprecationPredicate<"IT">; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 896fd0f7850c..43bd4c21dc39 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -626,7 +626,7 @@ class VLD1D op7_4, string Dt> "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } class VLD1Q op7_4, string Dt> : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), @@ -634,7 +634,7 @@ class VLD1Q op7_4, string Dt> "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def VLD1d8 : VLD1D<{0,0,0,?}, "8">; @@ -655,16 +655,14 @@ multiclass VLD1DWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } multiclass VLD1QWB op7_4, string Dt> { @@ -674,16 +672,14 @@ multiclass VLD1QWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } @@ -703,7 +699,7 @@ class VLD1D3 op7_4, string Dt> "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } multiclass VLD1D3WB op7_4, string Dt> { def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb), @@ -712,16 +708,14 @@ multiclass VLD1D3WB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } @@ -744,7 +738,7 @@ class VLD1D4 op7_4, string Dt> "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } multiclass VLD1D4WB op7_4, string Dt> { def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb), @@ -753,16 +747,14 @@ multiclass VLD1D4WB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } @@ -786,7 +778,7 @@ class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, "vld2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST2Instruction"; } def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>; @@ -810,16 +802,14 @@ multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; + let DecoderMethod = "DecodeVLDST2Instruction"; } def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm), itin, "vld2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; + let DecoderMethod = "DecodeVLDST2Instruction"; } } @@ -853,7 +843,7 @@ class VLD3D op11_8, bits<4> op7_4, string Dt> "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST3Instruction"; } def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; @@ -872,7 +862,7 @@ class VLD3DWB op11_8, bits<4> op7_4, string Dt> "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST3Instruction"; } def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; @@ -912,7 +902,7 @@ class VLD4D op11_8, bits<4> op7_4, string Dt> "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST4Instruction"; } def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; @@ -931,7 +921,7 @@ class VLD4DWB op11_8, bits<4> op7_4, string Dt> "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVLDInstruction"; + let DecoderMethod = "DecodeVLDST4Instruction"; } def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; @@ -1348,7 +1338,6 @@ multiclass VLD1DUPWB op7_4, string Dt> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; } def _register : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd, GPR:$wb), @@ -1357,7 +1346,6 @@ multiclass VLD1DUPWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; } } multiclass VLD1QDUPWB op7_4, string Dt> { @@ -1369,7 +1357,6 @@ multiclass VLD1QDUPWB op7_4, string Dt> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; } def _register : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd, GPR:$wb), @@ -1378,7 +1365,6 @@ multiclass VLD1QDUPWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; } } @@ -1419,7 +1405,6 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; - let AsmMatchConverter = "cvtVLDwbFixed"; } def _register : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd, GPR:$wb), @@ -1428,7 +1413,6 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy> { "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; - let AsmMatchConverter = "cvtVLDwbRegister"; } } @@ -1580,14 +1564,14 @@ class VST1D op7_4, string Dt> IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } class VST1Q op7_4, string Dt> : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd), IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def VST1d8 : VST1D<{0,0,0,?}, "8">; @@ -1608,8 +1592,7 @@ multiclass VST1DWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd), @@ -1617,8 +1600,7 @@ multiclass VST1DWB op7_4, string Dt> { "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } multiclass VST1QWB op7_4, string Dt> { @@ -1628,8 +1610,7 @@ multiclass VST1QWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd), @@ -1637,8 +1618,7 @@ multiclass VST1QWB op7_4, string Dt> { "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } @@ -1659,7 +1639,7 @@ class VST1D3 op7_4, string Dt> IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } multiclass VST1D3WB op7_4, string Dt> { def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), @@ -1668,8 +1648,7 @@ multiclass VST1D3WB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd), @@ -1677,8 +1656,7 @@ multiclass VST1D3WB op7_4, string Dt> { "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } @@ -1704,7 +1682,7 @@ class VST1D4 op7_4, string Dt> []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST1Instruction"; } multiclass VST1D4WB op7_4, string Dt> { def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), @@ -1713,8 +1691,7 @@ multiclass VST1D4WB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbFixed"; + let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd), @@ -1722,8 +1699,7 @@ multiclass VST1D4WB op7_4, string Dt> { "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbRegister"; + let DecoderMethod = "DecodeVLDST1Instruction"; } } @@ -1748,7 +1724,7 @@ class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, itin, "vst2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST2Instruction"; } def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2>; @@ -1772,16 +1748,14 @@ multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbFixed"; + let DecoderMethod = "DecodeVLDST2Instruction"; } def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbRegister"; + let DecoderMethod = "DecodeVLDST2Instruction"; } } multiclass VST2QWB op7_4, string Dt> { @@ -1791,8 +1765,7 @@ multiclass VST2QWB op7_4, string Dt> { "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbFixed"; + let DecoderMethod = "DecodeVLDST2Instruction"; } def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd), @@ -1800,8 +1773,7 @@ multiclass VST2QWB op7_4, string Dt> { "vst2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; - let AsmMatchConverter = "cvtVSTwbRegister"; + let DecoderMethod = "DecodeVLDST2Instruction"; } } @@ -1835,7 +1807,7 @@ class VST3D op11_8, bits<4> op7_4, string Dt> "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST3Instruction"; } def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">; @@ -1854,7 +1826,7 @@ class VST3DWB op11_8, bits<4> op7_4, string Dt> "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST3Instruction"; } def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; @@ -1894,7 +1866,7 @@ class VST4D op11_8, bits<4> op7_4, string Dt> "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST4Instruction"; } def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; @@ -1913,7 +1885,7 @@ class VST4DWB op11_8, bits<4> op7_4, string Dt> "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; - let DecoderMethod = "DecodeVSTInstruction"; + let DecoderMethod = "DecodeVLDST4Instruction"; } def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; @@ -2379,6 +2351,40 @@ class N2VQInt op24_23, bits<2> op21_20, bits<2> op19_18, (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; +// Same as above, but not predicated. +class N2VDIntnp op17_16, bits<3> op10_8, bit op7, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), + itin, OpcodeStr, Dt, ResTy, OpTy, + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; + +class N2VQIntnp op17_16, bits<3> op10_8, bit op7, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), + itin, OpcodeStr, Dt, ResTy, OpTy, + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; + +// Similar to NV2VQIntnp with some more encoding bits exposed (crypto). +class N2VQIntXnp op19_18, bits<2> op17_16, bits<3> op10_8, bit op6, + bit op7, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2Vnp; + +// Same as N2VQIntXnp but with Vd as a src register. +class N2VQIntX2np op19_18, bits<2> op17_16, bits<3> op10_8, bit op6, + bit op7, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2Vnp { + let Constraints = "$src = $Vd"; +} + // Narrow 2-register operations. 
class N2VN op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, @@ -2541,6 +2547,16 @@ class N3VDInt op21_20, bits<4> op11_8, bit op4, let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } + +class N3VDIntnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, Format f, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp; + class N3VDIntSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> : N3VLane32<0, 1, op21_20, op11_8, 1, 0, @@ -2552,6 +2568,7 @@ class N3VDIntSL op21_20, bits<4> op11_8, InstrItinClass itin, imm:$lane)))))]> { let isCommutable = 0; } + class N3VDIntSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> : N3VLane16<0, 1, op21_20, op11_8, 1, 0, @@ -2584,6 +2601,29 @@ class N3VQInt op21_20, bits<4> op11_8, bit op4, let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } + +class N3VQIntnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, Format f, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp; + +// Same as N3VQIntnp but with Vd as a src register. +class N3VQInt3np op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, Format f, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp { + let Constraints = "$src = $Vd"; +} + class N3VQIntSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> @@ -2834,6 +2874,7 @@ class N3VL op21_20, bits<4> op11_8, bit op4, [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { let isCommutable = Commutable; } + class N3VLSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> @@ -2889,6 +2930,17 @@ class N3VLInt op21_20, bits<4> op11_8, bit op4, [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { let isCommutable = Commutable; } + +// Same as above, but not predicated. 
+class N3VLIntnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp; + class N3VLIntSL op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> @@ -3965,12 +4017,18 @@ defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vqadd", "u", int_arm_neon_vqaddu, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) -defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", - int_arm_neon_vaddhn, 1>; +defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>; // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", int_arm_neon_vraddhn, 1>; +def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))), + (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>; +def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))), + (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>; +def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))), + (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>; + // Vector Multiply Operations. // VMUL : Vector Multiply (integer, polynomial and floating-point) @@ -4008,6 +4066,17 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), + (VMULslfd DPR:$Rn, + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), + (i32 0))>; +def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), + (VMULslfq QPR:$Rn, + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), + (i32 0))>; + + // VQDMULH : Vector Saturating Doubling Multiply Returning High Half defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, @@ -4053,12 +4122,18 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) -defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "s", NEONvmulls, 1>; -defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "u", NEONvmullu, 1>; -def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", - v8i16, v8i8, int_arm_neon_vmullp, 1>; +let PostEncoderMethod = "NEONThumb2DataIPostEncoder", + DecoderNamespace = "NEONData" in { + defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "s", NEONvmulls, 1>; + defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "u", NEONvmullu, 1>; + def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", + v8i16, v8i8, int_arm_neon_vmullp, 1>; + def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary, + "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>, + Requires<[HasV8, HasCrypto]>; +} defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>; defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>; @@ -4125,8 +4200,27 @@ defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, - "vqdmlal", "s", int_arm_neon_vqdmlal>; -defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>; + 
"vqdmlal", "s", null_frag>; +defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; + +def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; +def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; // VMLS : Vector Multiply Subtract (integer and floating-point) defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, @@ -4182,25 +4276,44 @@ defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>; // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, - "vqdmlsl", "s", int_arm_neon_vqdmlsl>; -defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; + "vqdmlsl", "s", null_frag>; +defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", null_frag>; + +def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; +def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. 
def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasVFP4,UseFusedMAC]>; + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasVFP4,UseFusedMAC]>; + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; // Fused Vector Multiply Subtract (floating-point) def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasVFP4,UseFusedMAC]>; + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasVFP4,UseFusedMAC]>; + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; // Match @llvm.fma.* intrinsics def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), @@ -4248,12 +4361,18 @@ defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, "vqsub", "u", int_arm_neon_vqsubu, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) -defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", - int_arm_neon_vsubhn, 0>; +defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>; // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i", int_arm_neon_vrsubhn, 0>; +def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))), + (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>; +def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))), + (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>; +def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), + (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>; + // Vector Comparisons. // VCEQ : Vector Compare Equal @@ -4659,6 +4778,18 @@ def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", "f32", v4f32, v4f32, int_arm_neon_vmaxs, 1>; +// VMAXNM +let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def VMAXNMND : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f32", + v2f32, v2f32, int_arm_neon_vmaxnm, 1>, + Requires<[HasV8, HasNEON]>; + def VMAXNMNQ : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f32", + v4f32, v4f32, int_arm_neon_vmaxnm, 1>, + Requires<[HasV8, HasNEON]>; +} + // VMIN : Vector Minimum defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, @@ -4673,6 +4804,18 @@ def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", "f32", v4f32, v4f32, int_arm_neon_vmins, 1>; +// VMINNM +let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def VMINNMND : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f32", + v2f32, v2f32, int_arm_neon_vminnm, 1>, + Requires<[HasV8, HasNEON]>; + def VMINNMNQ : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f32", + v4f32, v4f32, int_arm_neon_vminnm, 1>, + Requires<[HasV8, HasNEON]>; +} + // Vector Pairwise Operations. // VPADD : Vector Pairwise Add @@ -5015,10 +5158,10 @@ def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, // Vector Move Operations. 
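VMAXNM/VMINNM are built on the new unpredicated ("np") NEON classes, so they are ARMv8-only and carry no condition field; they provide the IEEE 754-2008 maxNum/minNum treatment of NaN operands. Typical forms the new definitions accept (registers arbitrary):

    vmaxnm.f32  d0, d1, d2
    vminnm.f32  q0, q1, q2
    @ No predication: a conditional form such as "vmaxnmeq.f32 d0, d1, d2"
    @ is not valid.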
// VMOV : Vector Move (Register) -def : InstAlias<"vmov${p} $Vd, $Vm", - (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; -def : InstAlias<"vmov${p} $Vd, $Vm", - (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vmov${p} $Vd, $Vm", + (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vmov${p} $Vd, $Vm", + (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; // VMOV : Vector Move (Immediate) @@ -5386,6 +5529,26 @@ def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", v4f32, v4i32, uint_to_fp>; +// VCVT{A, N, P, M} +multiclass VCVT_FPI op10_8, SDPatternOperator IntS, + SDPatternOperator IntU> { + let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>; + def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>; + def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>; + def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>; + } +} + +defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>; +defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>; +defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>; +defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>; + // VCVT : Vector Convert Between Floating-Point and Fixed-Point. let DecoderMethod = "DecodeVCVTD" in { def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", @@ -5409,6 +5572,25 @@ def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; } +def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", + (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", + (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", + (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", + (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", + (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", + (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", + (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", + (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>; + + // VCVT : Vector Convert Between Half-Precision and Single-Precision. 
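The VCVT_FPI multiclass above adds the ARMv8 round-to-integral conversions in all four rounding modes, again unpredicated, while the new NEONInstAlias entries let a fixed-point conversion with a zero fraction width fall through to the plain integer form instead of being rejected. For instance (registers arbitrary):

    vcvta.s32.f32  d0, d1        @ round to nearest, ties away from zero
    vcvtm.u32.f32  q2, q3        @ round toward minus infinity
    vcvt.s32.f32   d0, d1, #0    @ now accepted; same as "vcvt.s32.f32 d0, d1"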
def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, IIC_VUNAQ, "vcvt", "f16.f32", @@ -5509,8 +5691,9 @@ class VEXTd IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn), (Ty DPR:$Vm), imm:$index)))]> { - bits<4> index; - let Inst{11-8} = index{3-0}; + bits<3> index; + let Inst{11} = 0b0; + let Inst{10-8} = index{2-0}; } class VEXTq @@ -5525,14 +5708,14 @@ class VEXTq } def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> { - let Inst{11-8} = index{3-0}; + let Inst{10-8} = index{2-0}; } def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> { - let Inst{11-9} = index{2-0}; + let Inst{10-9} = index{1-0}; let Inst{8} = 0b0; } def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> { - let Inst{11-10} = index{1-0}; + let Inst{10} = index{0}; let Inst{9-8} = 0b00; } def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), @@ -5657,6 +5840,77 @@ def VTBX4Pseudo IIC_VTBX4, "$orig = $dst", []>; } // DecoderMethod = "DecodeTBLInstruction" +// VRINT : Vector Rounding +multiclass VRINT_FPI op9_7, SDPatternOperator Int> { + let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f32", + v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> { + let Inst{9-7} = op9_7; + } + def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f32", + v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> { + let Inst{9-7} = op9_7; + } + } + + def : NEONInstAlias(NAME#"D") DPR:$Dd, DPR:$Dm)>; + def : NEONInstAlias(NAME#"Q") QPR:$Qd, QPR:$Qm)>; +} + +defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>; +defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>; +defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>; +defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>; +defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>; +defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>; + +// Cryptography instructions +let PostEncoderMethod = "NEONThumb2DataIPostEncoder", + DecoderNamespace = "v8Crypto" in { + class AES + : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary, + !strconcat("aes", op), "8", v16i8, v16i8, Int>, + Requires<[HasV8, HasCrypto]>; + class AES2Op + : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary, + !strconcat("aes", op), "8", v16i8, v16i8, Int>, + Requires<[HasV8, HasCrypto]>; + class N2SHA op17_16, bits<3> op10_8, bit op7, bit op6, + SDPatternOperator Int> + : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary, + !strconcat("sha", op), "32", v4i32, v4i32, Int>, + Requires<[HasV8, HasCrypto]>; + class N2SHA2Op op17_16, bits<3> op10_8, bit op7, bit op6, + SDPatternOperator Int> + : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary, + !strconcat("sha", op), "32", v4i32, v4i32, Int>, + Requires<[HasV8, HasCrypto]>; + class N3SHA3Op op27_23, bits<2> op21_20, SDPatternOperator Int> + : N3VQInt3np, + Requires<[HasV8, HasCrypto]>; +} + +def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>; +def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>; +def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>; +def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>; + +def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, int_arm_neon_sha1h>; +def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>; +def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>; +def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, int_arm_neon_sha1c>; +def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, int_arm_neon_sha1m>; +def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, int_arm_neon_sha1p>; 
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>; +def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>; +def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>; +def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>; + //===----------------------------------------------------------------------===// // NEON instructions for single-precision FP math //===----------------------------------------------------------------------===// @@ -6697,12 +6951,17 @@ def VST4qWB_register_Asm_32 : (ins VecListFourQ:$list, addrmode6:$addr, rGPR:$Rm, pred:$p)>; -// VMOV takes an optional datatype suffix +// VMOV/VMVN takes an optional datatype suffix defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm", (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm", (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm", + (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm", + (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>; + // VCLT (register) is an assembler alias for VCGT w/ the operands reversed. // D-register versions. def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm", diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index ae7a5c00bd74..af5ef537b536 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -69,11 +69,6 @@ def thumb_immshifted_shamt : SDNodeXFormgetTargetConstant(V, MVT::i32); }]>; -// ADR instruction labels. -def t_adrlabel : Operand { - let EncoderMethod = "getThumbAdrLabelOpValue"; -} - // Scaled 4 immediate. def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; } def t_imm0_1020s4 : Operand { @@ -97,12 +92,34 @@ def t_imm0_508s4_neg : Operand { // Define Thumb specific addressing modes. +// unsigned 8-bit, 2-scaled memory offset +class OperandUnsignedOffset_b8s2 : AsmOperandClass { + let Name = "UnsignedOffset_b8s2"; + let PredicateMethod = "isUnsignedOffset<8, 2>"; +} + +def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2; + +// thumb style PC relative operand. signed, 8 bits magnitude, +// two bits shift. can be represented as either [pc, #imm], #imm, +// or relocatable expression... +def ThumbMemPC : AsmOperandClass { + let Name = "ThumbMemPC"; +} + let OperandType = "OPERAND_PCREL" in { def t_brtarget : Operand { let EncoderMethod = "getThumbBRTargetOpValue"; let DecoderMethod = "DecodeThumbBROperand"; } +// ADR instruction labels. +def t_adrlabel : Operand { + let EncoderMethod = "getThumbAdrLabelOpValue"; + let PrintMethod = "printAdrLabelOperand<2>"; + let ParserMatchClass = UnsignedOffset_b8s2; +} + def t_bcctarget : Operand { let EncoderMethod = "getThumbBCCTargetOpValue"; let DecoderMethod = "DecodeThumbBCCTargetOperand"; @@ -122,6 +139,15 @@ def t_blxtarget : Operand { let EncoderMethod = "getThumbBLXTargetOpValue"; let DecoderMethod = "DecodeThumbBLXOffset"; } + +// t_addrmode_pc :=